# import libraries and define global settings
# NOTE(review): the export duplicated this entire import block (it appeared
# twice, back to back, with a stray "Code" header); collapsed to one copy.
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# pandas/seaborn for ex12+
import pandas as pd
import seaborn as sns
# setting seed for reproducibility:
np.random.seed(10)
# R session setup (this line was fused onto the previous one in the export):
library(ggplot2)
library(dplyr)
library(tidyr)
library(glue)
library(stringr)
library(purrr)
library(patchwork)
set.seed(10)
# common theme for all plots:
# (blank background, black axis lines, larger fonts, boxed legend)
# FIX(review): removed the trailing comma after the last theme() argument --
# an empty trailing argument is an error in R function calls.
myTheme = theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
axis.title = element_text(size=15),
axis.text = element_text(size=15),
title = element_text(size=15),
legend.title = element_blank(),
legend.box.background = element_rect(color="gray", linewidth = 1),
legend.text = element_text(size = 12),
legend.key = element_blank(),
legend.key.width = unit(1, "cm")
)
# Figure 11.1: Goals of the t-test ----
## panel A: one-sample t-test ----
# 30 draws from a normal with mean .5 (sd defaults to 1); the one-sample
# t-test asks whether their mean differs from 0 (the dashed line below)
data = rnorm(n = 30, mean = .5)
# convert to DF:
data = tibble(data_index = 1:length(data), data_value = data)
# plot:
pA = ggplot(data=data, aes(x=data_index, y = data_value)) +
geom_point(
size=5,
shape=21,
color = "black",
fill = "gray"
) +
# H0 value (zero) as a dashed horizontal reference line
geom_hline(yintercept = 0, linetype="dashed") +
# inline theme: same settings as myTheme, minus the legend entries
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
axis.title = element_text(size=15),
axis.text = element_text(size=15),
title = element_text(size=15)
) +
labs(title = bquote(bold("A)")~"One sample"),
x = "Data index", y = "Data value")
## panel B: paired-samples t-test ----
N = 20
data1 = rnorm(n=N)
data2 = data1 + .5 + rnorm(N)*.4
pB = ggplot() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
axis.title = element_text(size=15),
axis.text = element_text(size=15, color="black"),
title = element_text(size=15)
)
for (i in 1:N) {
# i =1
data = tibble(data_index = factor(c(0, 1)),
data_value = c(data1[[i]], data2[[i]]))
# pick a random color
rgb_code = runif(1,min = 0, max = .8)
color_ = rgb(rgb_code, rgb_code, rgb_code)
# plotting
pB = pB +
aes(x=data_index, y=data_value, group=1) +
geom_line(data=data) +
geom_point(
data=data,
size = 5,
color = color_,
fill = color_,
shape=21)
}
pB = pB +
scale_x_discrete(labels = c("pre", "post")) +
labs(title = bquote(bold("B)")~"Paired samples"),
x = "", y = "Data value")
## panel C: two-samples t-test ----
pC = ggplot() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line = element_line(colour = "black"),
axis.title = element_text(size=15),
title = element_text(size=15),
legend.position = c(.9, .9),
legend.title = element_blank(),
legend.box.background = element_rect(color="gray", linewidth = 1),
legend.text = element_text(size = 12),
legend.key = element_blank(),
legend.key.width = unit(1, "cm"),
)
for (i in seq(0, 1)) {
# i=1
data = rnorm(1000, i, (i+1)/2)
# histogram
nbreaks_ = nclass.FD(data)
hist_ = hist(data, breaks = nbreaks_, plot=F)
# convert to DF:
data = tibble(data_index = hist_$mids,
data_value = hist_$counts,
category=glue("Group {i+1}"))
# plot
pC = pC +
aes(x=data_index, y = data_value, color=category) +
geom_line(
data=data,
linewidth=2
)}
pC = pC +
scale_color_manual(values=c("Group 1" = rgb(0, 0, 0),
"Group 2" = rgb(0.5, 0.5, 0.5)
)) +
labs(title = bquote(bold("C)")~"Two ind. samples"),
x = "Exam score", y = "Count")
p11.1 = pA + pB + pC + plot_layout(ncol=3)
p11.1# saving
ggsave("ttest_ttestGoals.png", p11.1, width=18, height = 5)# Figure 11.2: A t-pdf ----
# t-axis support for the pdf
t = seq(-4, 4, length=573)
# a pdf with df=20
tpdf = dt(t, df = 20)
# convert to DF:
data = tibble(data_index=t,
              data_value=tpdf)
p_ttest_tpdf = ggplot(data=data, aes(x=data_index, y=data_value)) +
  geom_line() +
  myTheme +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(y="Probability", x = "T value")
p_ttest_tpdf
# saving:
# BUG FIX: the original call was `ggsave(pt, "ttest_tpdf.png")` -- arguments
# swapped, and `pt` is R's t-distribution CDF function, not the plot object.
# The plot built above is `p_ttest_tpdf`.
ggsave("ttest_tpdf.png", p_ttest_tpdf)
## Computing p-values for one-tailed and two-tailed tests ----
# p-values for an example t-value and df
tval = 2.1
df = 13
# left-tail, right-tail, and two-tailed (sum of both tails) p-values
pvalL = pt(-tval, df, lower.tail = T)
pvalR = pt(tval, df, lower.tail = F)
pval2 = pvalR + pvalL
# (console output was fused onto these lines in the export; it has been
#  moved into the #> comments. Also fixed the "rigth" typo in the message.)
print(glue("One-tailed p-value on the left: {pvalL}"))
#> One-tailed p-value on the left: 0.0279063021356289
print(glue("One-tailed p-value on the right: {pvalR}"))
#> One-tailed p-value on the right: 0.0279063021356289
print(glue("Two-tailed p-value as the sum: {pvalR + pvalL}"))
#> Two-tailed p-value as the sum: 0.0558126042712578
print(glue("Two-tailed p-value by doubling: {2*pvalL}"))
#> Two-tailed p-value by doubling: 0.0558126042712578
# 1-cdf vs survival function
pvalS = pt(tval, df, lower.tail = F) # survival
pvalC = 1 - pt(tval, df, lower.tail = T) # 1-cdf
print(glue("P-value from 1-cdf: {pvalC}"))
#> P-value from 1-cdf: 0.0279063021356289
print(glue("P-value from s.f.: {pvalS}"))
#> P-value from s.f.: 0.0279063021356289
print(glue("Difference: {pvalC - pvalS}"))
#> Difference: 5.55111512312578e-17
Conclusion: for this particular t-value the difference is at machine precision. Still, the survival function is never less accurate than computing 1-cdf, so there is no harm in preferring sf.
# Figure 11.3: T-values from p-values ----
t = seq(-4, 4, length=75)
df = 13
# cdf based on t-values
cdf = pt(t, df) # pt is the cdf
# convert to DF:
cdf = tibble(
data_index = t,
data_value = cdf
)
# t-values based on cdf
pvals = seq(.001, .999, length=73)
tVals = qt(pvals, df) # qt is inverse cdf
# convert to DF
tVals = tibble(
data_index = pvals,
data_value = tVals
)
pA = ggplot(
data = cdf,
aes(x=data_index, y = data_value)
) +
geom_line(
linewidth=2
) +
myTheme +
labs(y="cdf", x = "t value",
title=bquote(bold("A)")~"CDF from t-values"))
pB = ggplot(
data = tVals,
aes(x=data_index, y = data_value)
) +
geom_line(
linewidth=2
) +
myTheme +
labs(y="t value", x = "cdf",
title=bquote(bold("B)")~"T-values from CDF"))
p11.3 = pA + pB + plot_layout(ncol=2)
p11.3ggsave("ttest_tFromP.png", p11.3, width=10, height=4)# example usage to get the t-value associated with p=.05 and df=13
pval = .05
tFromP_L = qt(pval, df, lower.tail = T) # inverse of P(X <= x)
tFromP_R1 = qt(1-pval, df, lower.tail = T) # inverse of P(X <= x)
tFromP_R2 = qt(pval, df, lower.tail = F) # inverse of P(X > x)
print(glue('Variable tFromP_L: {sprintf("%.3f",tFromP_L)}'))Variable tFromP_L: -1.771
print(glue('Variable tFromP_R1: {sprintf("%.3f", tFromP_R1)}'))Variable tFromP_R1: 1.771
print(glue('Variable tFromP_R2: {sprintf("%.3f", tFromP_R2)}'))Variable tFromP_R2: 1.771
# Figure: annotate an empirical t-value on the t distribution (Python version)
# empirical t-value and df
tval = 1.6
df = 20
alpha = .05
# redefine the t-values and corresponding pdf
t = np.linspace(-4,4,573)
tpdf = stats.t.pdf(t,20)
# its associated p-value (but this is one-tailed for visualization; see text and next cell!)
pval = 1-stats.t.cdf(tval,df)
# critical t-value for alpha
tCrit = stats.t.isf(alpha/2,df) # /2 for two-tailed!
pHalf = np.max(tpdf)/2 # 1/2 max. (vertical) p(t), used for plotting
plt.figure(figsize=(7,4))
# plot the t distribution
plt.plot(t,tpdf,'k',linewidth=1,label=r'$t_{20}$-pdf under H$_0$')
# plot the dashed line for the critical t-value
plt.axvline(tCrit,linestyle='--',color='gray')
plt.text(tCrit-.02,pHalf*2,r'$\alpha/2$ = %g'%(alpha/2),rotation=90,va='top',ha='right')
# arrow and formula for the empirical t-value
plt.gca().annotate(r'$t_{df}=\frac{\overline{x}-h_0}{s/\sqrt{n}}$=%g'%tval,xytext=(tval+1,pHalf),
                   xy=(tval,0), xycoords='data',size=18,
                   arrowprops=dict(arrowstyle='->', color='k',linewidth=2,
                                   connectionstyle='angle,angleA=0,angleB=-90,rad=0'))
# shaded area to the right of the empirical t-value
tidx = np.argmin(np.abs(t-tval))
plt.gca().fill_between(t[tidx:],tpdf[tidx:],color='k',alpha=.4)
# and its annotation
tidx = np.argmin(np.abs(t-(tval+t[-1])/2))
plt.gca().annotate(f'{100*pval:.2f}%',xy=(t[tidx],tpdf[tidx]),
                   xytext=(t[tidx]+1,pHalf/2),ha='center',arrowprops={'color':'k'})
# some final adjustments
plt.xlabel('T value')
plt.ylabel(r'$p(t|H_0)$')
# FIX(review): the export fused the notebook's return values onto the next
# four lines (e.g. `plt.yticks([])([], [])`), which would call the returned
# tuples at runtime; that residue was removed.
plt.yticks([])
plt.ylim([-.01,np.max(tpdf)*1.05])
plt.xlim([-1,t[-1]])
plt.ylim([0,pHalf*2.1])  # intentionally overrides the previous ylim
plt.tight_layout()
plt.savefig('ttest_tEmpWEq.png')
plt.show()
# Figure 11.4: Example t-value ----
# empirical t-value and df:
tval = 1.6
df = 20
alpha_ = .05
# redifine the t-values and corresponding pdf
t = seq(-4, 4, length=573)
tpdf = dt(t, df = 20)
# its associated p-value (but this is one-tailed for visualization; see text and next cell!)
pval = pt(tval, df, lower.tail = F)
# critical t-value for apha
tCrit = qt(alpha_/2, df = df, lower.tail = F) # /2 for two-tailed!
pHalf = max(tpdf)/2 # 1/2 max. (vertical) p(t), used for plotting
data = tibble(
data_index=t,
data_value = tpdf
)
tidx = which.min(abs(t - (tval + t[length(t)])/2))
p = ggplot(data, aes(x=data_index, y=data_value)) +
geom_line(
linewidth=1
) +
geom_ribbon(
data = filter(data, data_index >= tval),
aes(ymax = data_value, ymin=0),
fill="#969696") +
geom_vline(
xintercept = tCrit,
linetype = "dashed",
color="gray"
) +
annotate(
"text",
x = tCrit-0.2, y = pHalf*2,
label = bquote(alpha~"/2" == .(alpha_/2)),
angle = 90
) +
annotate(
"segment",
x = t[tidx] + 1,
xend = t[tidx],
y = pHalf/2,
yend = tpdf[tidx-12],
colour = "black",
linewidth = 2,
linejoin="mitre",
lineend="butt",
arrow = arrow(length = unit(.1, "inches"),
type="closed")
) +
annotate(
"text",
x = (t[tidx]+1), y = 1.1*(pHalf/2),
label = bquote(.(sprintf("%.2f", 100*pval))~"%"),
) +
geom_segment(
x = tval, xend = tval,
y = pHalf, yend = 0,
colour = "black",
arrow = arrow(length = unit(.1, "inches"))
) +
geom_segment(
x = tval+1, xend = tval,
y = pHalf, yend = pHalf,
colour = "black"
) +
annotate(
"text", size=6,
x = 1.25*(t[tidx]), y = pHalf,
label = bquote(t[df]~"="~frac(bar(x)-mu,'s /'~sqrt(n))~"="~ .(tval))
) +
xlim(-1, t[length(t)]) +
ylim(0, pHalf*2.1) +
myTheme +
labs(y = bquote(rho ~ "(t/" ~ H[0] ~ ")"),
x = "T value")
p
ggsave("ttest_tEmpWEq.png", p, width=7, height=5)
# (Python version of the two-tailed figure starts here; the line above is the
#  R save call that the export had fused onto the same line)
plt.figure(figsize=(7,4))
# plot the t distribution
plt.plot(t,tpdf,'k',linewidth=1,label=r'$t_{20}$-pdf under H$_0$')
# plot the dashed line for the critical t-value on the right side
plt.axvline(tCrit,linestyle='--',color='gray')
plt.text(tCrit-.02,pHalf*2,r'$\alpha/2$ = %g'%(alpha/2),rotation=90,va='top',ha='right')
# and again for the left side
plt.axvline(-tCrit,linestyle='--',color='gray')
plt.text(-tCrit+.028,pHalf*2,r'$\alpha/2$ = %g'%(alpha/2),rotation=90,va='top',ha='left')
# arrow and formula for the empirical t-value
plt.gca().annotate(r'$t=$%g'%tval,xytext=(tval,pHalf),
                   xy=(tval,0), xycoords='data',size=18,ha='center',bbox=dict(fc='w',edgecolor='none'),
                   arrowprops=dict(arrowstyle='->', color='k',linewidth=2))
# repeat on the left
plt.gca().annotate(r'$t=$-%g'%tval,xytext=(-tval,pHalf),
                   xy=(-tval,0), xycoords='data',size=18,ha='center',bbox=dict(fc='w',edgecolor='none'),
                   arrowprops=dict(arrowstyle='->', color='k',linewidth=2))
# shaded area to the right of the empirical t-value
tidx = np.argmin(np.abs(t-tval))
plt.gca().fill_between(t[tidx:],tpdf[tidx:],color='k',alpha=.4)
# ... and to the left of -tval (original wrote `t--tval`; t+tval is equivalent)
tidx = np.argmin(np.abs(t+tval))
plt.gca().fill_between(t[:tidx],tpdf[:tidx],color='k',alpha=.4)
# and its annotation for the right side
tidx = np.argmin(np.abs(t-(tval+t[-1])/2))
plt.gca().annotate(f'{100*pval:.2f}%',xy=(t[tidx],tpdf[tidx]),xytext=(t[tidx]+1,pHalf/2),ha='center',arrowprops={'color':'k'})
# now for the left side
tidx = np.argmin(np.abs(t-(-tval+t[0])/2))
plt.gca().annotate(f'{100*pval:.2f}%',xy=(t[tidx],tpdf[tidx]),xytext=(t[tidx]-.5,pHalf/2),ha='center',arrowprops={'color':'k'})
# some final adjustments
# FIX(review): notebook return-value residue removed from the four lines below
plt.xlabel('T value')
plt.ylabel(r'$p(t|H_0)$')
plt.yticks([])
plt.ylim([-.01,np.max(tpdf)*1.05])
plt.xlim(t[[0,-1]])
plt.ylim([0,pHalf*2.1])  # intentionally overrides the previous ylim
plt.tight_layout()
plt.show()
# Figure 11.5: Completion of the previous figure to show both tails ----
# plot the t distribution
p11.5 = ggplot(data, aes(x=data_index, y=data_value)) +
geom_line(
linewidth=1
)
# plot shaded area and dashed line for the critical t-value on the right side
p11.5 = p11.5 +
geom_ribbon(
data = filter(data, data_index >= tval),
aes(ymax = data_value, ymin=0),
fill="#969696") +
geom_vline(
xintercept = tCrit,
linetype = "dashed",
color="gray"
) +
annotate(
"text",
x = tCrit-0.2, y = pHalf*2,
label = bquote(alpha~"/2" == .(alpha_/2)),
angle = 90
) +
geom_segment(
x = tval, xend = tval,
y = pHalf, yend = 0,
colour = "black",
arrow = arrow(length = unit(.1, "inches"))
) +
geom_label(
aes(x=tval, y = 1.05*pHalf,
label="label",
colour="white")
) +
annotate(
"label",
size=6,
x = tval, y = 1.05*pHalf,
label.size=0,
label = glue("t={tval}")
)
# # and again for the left side
p11.5 =
p11.5 +
geom_ribbon(
data = filter(data, data_index <= -tval),
aes(ymax = data_value, ymin=0),
fill="#969696") +
geom_vline(
xintercept = -tCrit,
linetype = "dashed",
color="gray"
) +
annotate(
"text",
x = -tCrit+0.2, y = pHalf*2,
label = bquote(alpha~"/2" == .(alpha_/2)),
angle = 90
) +
geom_segment(
x = -tval, xend = -tval,
y = pHalf, yend = 0,
colour = "black",
arrow = arrow(length = unit(.1, "inches"))
) +
geom_label(
aes(x=-tval, y = 1.05*pHalf,
label="label",
colour="white")
) +
annotate(
"label",
size=6,
x = -tval, y = 1.05*pHalf,
label.size=0,
label = glue("t={tval}")
)
## arrow and formula for the empirical t-value
# left:
tidx_left = which.min(abs(t-(-tval+t[1])/2))
p11.5 = p11.5 +
annotate(
"segment",
x = t[tidx_left]-.5,
xend = t[tidx_left],
y = pHalf/2,
yend = tpdf[tidx_left+12],
colour = "black",
linewidth = 2,
linejoin="mitre",
lineend="butt",
arrow = arrow(length = unit(.1, "inches"),
type="closed")
) +
annotate(
"text",
x = (t[tidx_left]-.5), y = 1.1*(pHalf/2),
label = bquote(.(sprintf("%.2f", 100*pval))~"%"),
)
# right:
tidx_right = which.min(abs(t-(tval+t[length(t)])/2))
p11.5 = p11.5 + annotate(
"segment",
x = t[tidx_right] + 1,
xend = t[tidx_right],
y = pHalf/2,
yend = tpdf[tidx_right-12],
colour = "black",
linewidth = 2,
linejoin="mitre",
lineend="butt",
arrow = arrow(length = unit(.1, "inches"),
type="closed")
) +
annotate(
"text",
x = (t[tidx_right]+1), y = 1.1*(pHalf/2),
label = bquote(.(sprintf("%.2f", 100*pval))~"%"),
)
# final adjustments
p11.5 = p11.5 +
ylim(0, pHalf*2.1) +
myTheme +
theme(legend.position = "none",
axis.ticks.y = element_blank(),
axis.text.y = element_blank()) +
labs(y = bquote(rho ~ "(t/" ~ H[0] ~ ")"),
x = "T value")
p11.5ggsave("ttest_tEmpWEq2.png", p11.5, width=8, height=6)# the data
# the data: one normal sample and one log-normal (clearly non-normal) sample
data1 = np.random.randn(100)
data2 = np.exp( np.random.randn(100) )
# omnibus (D'Agostino-Pearson) test
Otest1 = stats.normaltest(data1)
Otest2 = stats.normaltest(data2)
# Shapiro's test
Stest1 = stats.shapiro(data1)
Stest2 = stats.shapiro(data2)
# report the results
# FIX(review): the export fused two print statements into one invalid line
# (`print('')print(f'...')`) and fused the console output onto the code
# lines; outputs are preserved in the #> comments.
print(f'Omnibus test in X1 (H0=normal): p={Otest1.pvalue:.3f}')
#> Omnibus test in X1 (H0=normal): p=0.842
print(f'Omnibus test in X2 (H0=normal): p={Otest2.pvalue:.3f}')
#> Omnibus test in X2 (H0=normal): p=0.000
print('')
print(f'Shapiro test in X1 (H0=normal): p={Stest1.pvalue:.3f}')
#> Shapiro test in X1 (H0=normal): p=0.405
print(f'Shapiro test in X2 (H0=normal): p={Stest2.pvalue:.3f}')
#> Shapiro test in X2 (H0=normal): p=0.000
# show the histograms
yy1,xx1 = np.histogram(data1,bins='fd')
xx1 = (xx1[1:]+xx1[:-1])/2
yy2,xx2 = np.histogram(data2,bins='fd')
xx2 = (xx2[1:]+xx2[:-1])/2
# plotting
plt.figure(figsize=(4,4))
plt.plot(xx1,yy1,'k--',linewidth=3,label=r'$X_1$')
plt.plot(xx2,yy2,linewidth=3,color=(.5,.5,.5),label=r'$X_2$')
plt.gca().set(xlabel='Data value',ylabel='Count')
plt.legend()
plt.tight_layout()
#plt.savefig('ttest_normTests.png')
plt.show()# Figure 11.6: Testing for normality ----
# the data
data1 = rnorm(100)
data2 = exp(rnorm(100))
# omnibus test:
Otest1 = nortest::pearson.test(data1)
Otest2 = nortest::pearson.test(data2)
# Shapiro's test:
Stest1 = shapiro.test(data1)
Stest2 = shapiro.test(data2)
# report the results:
print(glue("Omnibus test in X1 (H0=normal): p={sprintf('%.3f', Otest1$p.value)}"))Omnibus test in X1 (H0=normal): p=0.639
print(glue("Omnibus test in X2 (H0=normal): p={sprintf('%.3f', Otest2$p.value)}"))Omnibus test in X2 (H0=normal): p=0.000
print(glue("Shapiro test in X1 (H0=normal): p={sprintf('%.3f', Stest1$p.value)}"))Shapiro test in X1 (H0=normal): p=0.840
print(glue("Shapiro test in X2 (H0=normal): p={sprintf('%.3f', Stest2$p.value)}"))Shapiro test in X2 (H0=normal): p=0.000
# show the histograms
nbreaks1 = nclass.FD(data1)
data1_df = hist(data1, breaks = nbreaks1, plot = F)
data1_df = tibble(
data_index = data1_df$mids,
data_value = data1_df$counts
)
nbreaks2 = nclass.FD(data2)
data2_df = hist(data2, breaks = nbreaks2, plot = F)
data2_df = tibble(
data_index = data2_df$mids,
data_value = data2_df$counts
)
# plotting:
p11.6 = ggplot() +
geom_line(
data=data1_df,
aes(x=data_index, y=data_value,
linetype="X1", color="X1"),
linewidth=2
) +
geom_line(
data=data2_df,
aes(x=data_index, y=data_value,
linetype="X2", color="X2"),
linewidth=2
)
# final adjustments:
p11.6 = p11.6 +
scale_color_manual(
values = c("X1" = "black",
"X2" = "gray")
) +
scale_linetype_manual(
values = c("X1" = "dashed",
"X2" = "solid")
) +
myTheme +
theme(legend.position = c(.8, .8),
legend.key.width = unit(2, "cm"),) +
labs(color="", linetype="",
y="Count", x="Data value")
p11.6# saving;
ggsave("ttest_normTests.png", p11.6, width=4, height=4)x = np.linspace(-4,4,501)
_,axs = plt.subplots(1,3,figsize=(10,4))
## panel A: probably not significant
g1 = stats.norm.pdf(x,loc=-.3,scale=1)
g2 = stats.norm.pdf(x,loc= .3,scale=1)
axs[0].plot(x,g1,color='k',linewidth=2)
axs[0].plot(x,g2,'--',color=(.6,.6,.6),linewidth=2)
axs[0].set(xticks=[],xlim=x[[0,-1]],yticks=[],ylabel='Probability',
title=r'$\bf{A}$) Non-significant')
## panel B: significant by larger mean difference
g1 = stats.norm.pdf(x,loc=-1,scale=1)
g2 = stats.norm.pdf(x,loc= 1,scale=1)
axs[1].plot(x,g1,color='k',linewidth=2)
axs[1].plot(x,g2,'--',color=(.6,.6,.6),linewidth=2)
axs[1].set(xticks=[],xlim=x[[0,-1]],yticks=[],ylabel='Probability',
title=r'$\bf{B}$) Large mean distance')
## panel C: significant by reduced variance
g1 = stats.norm.pdf(x,loc=-.3,scale=.2)
g2 = stats.norm.pdf(x,loc= .3,scale=.2)
axs[2].plot(x,g1,color='k',linewidth=2)
axs[2].plot(x,g2,'--',color=(.6,.6,.6),linewidth=2)
axs[2].set(xticks=[],xlim=x[[0,-1]],yticks=[],xlabel='Data value',ylabel='Probability',
title=r'$\bf{C}$) Low variance')
plt.tight_layout()
plt.savefig('ttest_sigMecs.png')
plt.show()# Figure 11.7: Increasing the t-value ----
x = seq(-4, 4, length=501)
## panel A: probably not significant
g1_A = tibble(data_index = x,
data_value = dnorm(x, -.3, 1))
g2_A = tibble(data_index = x,
data_value = dnorm(x, .3, 1))
p11.7_A = ggplot() +
geom_line(
data=g1_A,
aes(x=data_index, y=data_value,
linetype="g1", color="g1"),
linewidth=1
) +
geom_line(
data=g2_A,
aes(x=data_index, y=data_value,
linetype="g2", color="g2"),
linewidth=1
) +
scale_color_manual(
values = c("g1" = "black",
"g2" = "gray")
) +
scale_linetype_manual(
values = c("g1" = "solid",
"g2" = "dashed")
) +
myTheme +
theme(
legend.position = "none",
axis.ticks = element_blank(),
axis.text = element_blank()
) +
labs(title=bquote(bold("A)")~"Non-significant"),
x="",y="Probability")
# Panel B: significant by larger mean difference
g1_B = tibble(data_index=x, data_value=dnorm(x, -1, 1))
g2_B = tibble(data_index=x, data_value=dnorm(x, 1, 1))
p11.7_B = ggplot() +
geom_line(
data=g1_B,
aes(x=data_index, y=data_value,
linetype="g1", color="g1"),
linewidth=1
) +
geom_line(
data=g2_B,
aes(x=data_index, y=data_value,
linetype="g2", color="g2"),
linewidth=1
) +
scale_color_manual(
values = c("g1" = "black",
"g2" = "gray")
) +
scale_linetype_manual(
values = c("g1" = "solid",
"g2" = "dashed")
) +
myTheme +
theme(
legend.position = "none",
axis.ticks = element_blank(),
axis.text = element_blank()
) +
labs(title=bquote(bold("B)")~"Large mean distance"),
x="",y="Probability")
# panel C: significant by reduced variance
g1_C = tibble(data_index=x, data_value=dnorm(x, -.3, .2))
g2_C = tibble(data_index=x, data_value=dnorm(x, .3, .2))
p11.7_C = ggplot() +
geom_line(
data=g1_C,
aes(x=data_index, y=data_value,
linetype="g1", color="g1"),
linewidth=1
) +
geom_line(
data=g2_C,
aes(x=data_index, y=data_value,
linetype="g2", color="g2"),
linewidth=1
) +
scale_color_manual(
values = c("g1" = "black",
"g2" = "gray")
) +
scale_linetype_manual(
values = c("g1" = "solid",
"g2" = "dashed")
) +
myTheme +
theme(
legend.position = "none",
axis.ticks = element_blank(),
axis.text = element_blank()
) +
labs(title=bquote(bold("C)")~"Low variance"),
x="",y="Probability")
p11.7 = p11.7_A + p11.7_B + p11.7_C + plot_layout(ncol=3)
p11.7ggsave("ttest_sigMecs.png", p11.7, width=10, height = 4)# given data
# One-sample t-test computed manually and cross-checked against scipy.
# given data
X = np.array([80, 85, 90, 70, 75, 72, 88, 77, 82, 65, 79, 81, 74, 86, 68])
h0 = 75
# descriptives
meanX = np.mean(X)
stdX = np.std(X,ddof=1)  # ddof=1: sample (n-1) standard deviation
ssize = len(X)
# t-value
tval = (meanX-h0) / (stdX/np.sqrt(ssize))
# p-value via the survival function (sf == 1-cdf but numerically more
# accurate, per the 1-cdf vs. sf comparison earlier in this chapter)
pval = stats.t.sf(tval,ssize-1)
pval *= 2 # two-tailed!
# print everything out!
print(f'Sample mean: {meanX:.2f}')
print(f'Sample std: {stdX:.2f}')
print(f'Sample size: {ssize}')
#print('')
print(f'T-value: {tval:.3f}')
print(f'p-value: {pval:.3f}')
# Repeat using the stats library:
ttest = stats.ttest_1samp(X,h0)
# the output variable is its own type
print( type(ttest) )
# which contains three elements:
print('')
print(ttest)
# let's print the results
print('')
print('Results from stats.ttest_1samp:')
print(f't({ttest.df})={ttest.statistic:.3f}, p<{ttest.pvalue:.3f}')
# btw, data are consistent with a normal distribution
print(f'Shapiro p-value = {stats.shapiro(X).pvalue:.2f}')
# One-sample T-test ----
# given data
X = c(80, 85, 90, 70, 75, 72, 88, 77, 82, 65, 79, 81, 74, 86, 68)
h0 = 75
# descriptives
meanX = mean(X)
stdX = sd(X)
ssize = length(X)
# t-value
tval = (meanX - h0) / (stdX/sqrt(ssize))
# p-value
pval = 1 - pt(tval, ssize-1)
pval = 2*pval
# print everything out!
print(sprintf("Sample mean: %.2f", meanX))[1] "Sample mean: 78.13"
print(sprintf("Sample std: %.2f", stdX))[1] "Sample std: 7.47"
print(glue("Sample size: {ssize}"))Sample size: 15
print(sprintf("T-value: %.3f", tval))[1] "T-value: 1.624"
print(sprintf("p-value: %.3f", pval))[1] "p-value: 0.127"
# Repeat using the stats library:
ttest = t.test(X, mu=h0)
# the output variable is its own type:
print(class(ttest))[1] "htest"
# let's print the results:
print("Results from t.test:")[1] "Results from t.test:"
print(sprintf("t(%d) = %.3f, p<%.3f", ttest$parameter, ttest$statistic, ttest$p.value))[1] "t(14) = 1.624, p<0.127"
# btw, data are consistent with a normal distribution
print(sprintf("Shapiro p-value = %.2f", shapiro.test(X)$p.value))[1] "Shapiro p-value = 0.96"
# the data
Xn = np.array([ 60, 52, 90, 20, 33, 95, 18, 47, 78, 65 ])
Xq = np.array([ 65, 60, 84, 23, 37, 95, 17, 53, 88, 66 ])
sampsize = len(Xn)
# their difference
Delta = Xq-Xn
# visualize
_,axs = plt.subplots(1,2,figsize=(7,3))
## draw the individual lines
for i,j in zip(Xn,Xq):
axs[0].plot([0,1],[i,j],'o-',color=(.8,.8,.8),
markersize=12,markerfacecolor=(.8,.8,.8),markeredgecolor='k')
axs[0].set(xlim=[-1,2],xticks=[0,1],ylabel='Scores',xticklabels=[r'X$_N$',r'X$_Q$'],
ylim=[0,100],title=r'$\bf{A}$) Raw data')
## draw the difference scores
axs[1].plot(np.zeros(sampsize),Delta,'ko',markersize=12,markerfacecolor=(.8,.8,.8))
axs[1].plot([-.1,.1],[0,0],'k--',zorder=-1)
axs[1].set(xticks=[0],ylabel='Difference scores',xticklabels=[r'$\Delta$'],
ylim=[-50,50],title=r'$\bf{B}$) Differences')
# export
plt.tight_layout()
plt.savefig('ttest_pairedTtest.png')
plt.show()
# test it!
ttest = stats.ttest_1samp(Delta,0)
# print the results
print(f't({ttest.df})={ttest.statistic:.3f}, p<{ttest.pvalue:.3f}')
# btw, data are consistent with a normal distribution
print(f'Xn Shapiro p-value = {stats.shapiro(Xn).pvalue:.2f}')
print(f'Xq Shapiro p-value = {stats.shapiro(Xq).pvalue:.2f}')
print(f'Xy Shapiro p-value = {stats.shapiro(Delta).pvalue:.2f}')# Figure 11.8: Paired-samples t-test ----
# the data
Xn = c( 60, 52, 90, 20, 33, 95, 18, 47, 78, 65 )
Xq = c( 65, 60, 84, 23, 37, 95, 17, 53, 88, 66 )
sampsize = length(Xn)
# their difference
Delta = Xq-Xn
# Visualize
data = tibble(
data_index=1:sampsize,
Xn = Xn,
Xq = Xq,
Delta = Delta
)
# from wide to long:
data = pivot_longer(
data,
cols = c("Xn", "Xq", "Delta")
)
p11.8_A = ggplot(filter(data, name != "Delta"),
aes(x = factor(name), y=value, group = data_index)) +
geom_line(
color="gray"
) +
geom_point(
shape=21,
size=5,
fill="gray"
) +
ylim(c(0, 100)) +
myTheme +
theme(
axis.text = element_text(color="black"),
axis.text.x = element_text(vjust=-2, hjust = .5),
) +
labs(title=bquote(bold("A)")~"Raw data"),
y="Scores", x = "")
p11.8_B = ggplot(data=filter(data, name == "Delta"),
aes(x = factor(name), y = value)) +
geom_hline(yintercept = 0, linetype="dashed") +
geom_point(
size=5,
shape=21,
fill="gray"
) +
ylim(c(-50, 50)) +
scale_x_discrete(labels= c(bquote(Delta))) +
myTheme +
theme(
axis.text = element_text(color="black"),
axis.text.x = element_text(vjust=-2, hjust = .5),
) +
labs(y = "Difference scores",
x = "",
title=bquote(bold("B)")~"Differences"))
p11.8 = p11.8_A + p11.8_B
p11.8# saving:
ggsave("ttest_pairedTtest.png", p11.8, width = 8, height = 4)
# test it!
ttest = t.test(Delta, mu=0)
# print the results:
print(sprintf("t(%d) = %.3f, p<%.3f", ttest$parameter, ttest$statistic, ttest$p.value))[1] "t(9) = 2.023, p<0.074"
# btw, data are consistent with a normal distribution
print(sprintf("Xn Shapiro p-value = %.2f", shapiro.test(Xn)$p.value))[1] "Xn Shapiro p-value = 0.68"
print(sprintf("Xq Shapiro p-value = %.2f", shapiro.test(Xq)$p.value))[1] "Xq Shapiro p-value = 0.63"
print(sprintf("Xy Shapiro p-value = %.2f", shapiro.test(Delta)$p.value))[1] "Xy Shapiro p-value = 0.98"
# generate data
data1 = stats.exponnorm.rvs(3,size=50)
data2 = stats.gumbel_r.rvs(size=42)
# compute their histograms
yy1,xx1 = np.histogram(data1,bins='fd')
xx1 = (xx1[1:]+xx1[:-1])/2
yy2,xx2 = np.histogram(data2,bins='fd')
xx2 = (xx2[1:]+xx2[:-1])/2
# show the data!
_,axs = plt.subplots(1,2,figsize=(7,3.5))
## raw data
axs[0].plot(np.random.randn(len(data1))/40,data1,
'ko',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[0].plot(np.random.randn(len(data2))/40+1,data2,
'ks',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[0].set(xlim=[-.5,1.5],xticks=[0,1],xticklabels=[r'$X_1$',r'$X_2$'],
title=r'$\bf{A}$) Data')
## histograms
axs[1].plot(xx1,yy1,'ko--',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6,linewidth=2,label=r'$X_1$')
axs[1].plot(xx2,yy2,'ks-',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6,linewidth=2,label=r'$X_2$')
axs[1].set(xlabel='Data value',ylabel='Count',title=r'$\bf{B}$) Distributions')
axs[1].legend()
plt.tight_layout()
plt.savefig('ttest_indTtest.png')
plt.show()
# doubling rubric
s1 = np.std(data1,ddof=1)
s2 = np.std(data2,ddof=1)
# report
print(f'Standard deviations are {s1:.2f} and {s2:.2f}')
print(f'Ratio of max:min stdevs is {np.max([s1,s2])/np.min([s1,s2]):.2f}')
# Levene's test
lres = stats.levene(data1,data2)
print('')
print(f"Levene's test for homogeneity of variance: W={lres.statistic:.2f}, p={lres.pvalue:.3f}")
## tests for normal distribution
# omnibus test
Otest1 = stats.normaltest(data1)
Otest2 = stats.normaltest(data2)
print(f'Omnibus test in X1 (H0=normal): p={Otest1.pvalue:.3f}')
print(f'Omnibus test in X2 (H0=normal): p={Otest2.pvalue:.3f}')
print('')
# Shapiro's test
Stest1 = stats.shapiro(data1)
Stest2 = stats.shapiro(data2)
print(f'Shapiro test in X1 (H0=normal): p={Stest1.pvalue:.3f}')
print(f'Shapiro test in X2 (H0=normal): p={Stest2.pvalue:.3f}')
# now for the t-test
tres = stats.ttest_ind(data1,data2,equal_var=False)
print(f't={tres.statistic:.2f}, p={tres.pvalue:.3f}')
# FYI, here's the result assuming equal variance (see also Exercise 9)
tres = stats.ttest_ind(data1,data2,equal_var=True)
print(f't={tres.statistic:.2f}, p={tres.pvalue:.3f}')# Figure 11.9: Example of 2-sample ttest ----
# generate data
library(emg) # for exponentially modified gaussian distribution
library(extraDistr) # for gaumbler distribution
library(car) # for levene test
# Exponentially modified normal:
x1 = remg(50, lambda = 1/3) # lambda = 1/K
# Gumbel distribution
x2 = extraDistr::rgumbel(n = 42)
# panel A:
data1_A = tibble(
data_index = "x1",
data_value = x1
)
data2_A = tibble(
data_index="x2",
data_value=x2
)
data_A = bind_rows(data1_A, data2_A)
p11.9_A = ggplot(data = data_A,
aes(x=factor(data_index), y=data_value,
group=data_index,
shape=data_index,
fill=data_index)) +
geom_point(
size=5,
alpha=.8
) +
scale_x_discrete(
labels= c(bquote(X[1]), bquote(X[2]))
) +
scale_fill_manual(
values=c("x1" = "gray",
"x2" = "gray"
)
) +
scale_shape_manual(
values = c("x1" = 21,
"x2" = 22)
) +
myTheme +
theme(legend.position = "none") +
labs(x="", y="",
title=bquote(bold("A)")~"Data"))
# panel B:
nbreaks1_B = nclass.FD(x1)
data1_B = hist(x1, breaks=nbreaks1_B, plot=F)
data1_B = tibble(data_index=data1_B$mids,
data_value=data1_B$counts,
group="x1")
nbreaks2_B = nclass.FD(x2)
data2_B = hist(x2, breaks=nbreaks2_B, plot=F)
data2_B = tibble(data_index=data2_B$mids,
data_value=data2_B$counts,
group="x2")
data_B = bind_rows(data1_B, data2_B)
p11.9_B = ggplot(
data=data_B,
aes(x = data_index, y = data_value, group=group,
color=group, shape=group, fill=group, linetype=group)
) +
geom_line(
linewidth=1
) +
geom_point(
size=5,
alpha=.8,
stroke=1
) +
scale_fill_manual(
values=c("x1" = "gray",
"x2" = "gray"),
labels=c("x1" = bquote(X[1]),
"x2" = bquote(X[2]))
) +
scale_linetype_manual(
values = c("x1" = "dashed",
"x2" = "solid"),
labels=c("x1" = bquote(X[1]),
"x2" = bquote(X[2]))
) +
scale_color_manual(
values=c("x1" = "#737373",
"x2" = "#737373"),
labels=c("x1" = bquote(X[1]),
"x2" = bquote(X[2]))
) +
scale_shape_manual(
values = c("x1" = 21,
"x2" = 22),
labels=c("x1" = bquote(X[1]),
"x2" = bquote(X[2]))
) +
myTheme +
theme(legend.position = c(.8, .8)) +
labs(y = "Count", x = "Data value",
title = bquote(bold("B)")~"Distributions"))
p11.9 = p11.9_A + p11.9_B
p11.9ggsave("ttest_indTtest.png", p11.9, width=10, height = 3.5)
# doubling rubric
s1 = sd(x1)
s2 = sd(x2)
# report
print(sprintf("Standard deviations are %.2f and %.2f", s1, s2))[1] "Standard deviations are 3.60 and 1.33"
print(sprintf("Ratio of max:min stdevs is %.2f", max(s1, s2)/min(s1, s2)))[1] "Ratio of max:min stdevs is 2.70"
# Levene's test
library(car)
data_levene_ = bind_rows(
tibble(group="x1", data=x1),
tibble(group="x2", data=x2),
)
lres = leveneTest(data~group, data_levene_)
print(sprintf("Levene's test for homogeneity of variance: W=%.2f, p=%.3f",
lres$`F value`[1], lres$`Pr(>F)`[1]))[1] "Levene's test for homogeneity of variance: W=11.91, p=0.001"
## tests for normal distribution
# omnibus test
Otest1 = nortest::pearson.test(x1)
Otest2 = nortest::pearson.test(x2)
print(sprintf("Omnibus test in X1 (H0 = normal): p=%.3f", Otest1$p.value))[1] "Omnibus test in X1 (H0 = normal): p=0.034"
print(sprintf("Omnibus test in X2 (H0 = normal): p=%.3f", Otest2$p.value))[1] "Omnibus test in X2 (H0 = normal): p=0.062"
# Shapiro's test
Stest1 = shapiro.test(x1)
Stest2 = shapiro.test(x2)
print(sprintf("Shapiro test in X1 (H0 = normal): p=%.3f", Stest1$p.value))[1] "Shapiro test in X1 (H0 = normal): p=0.000"
print(sprintf("Shapiro test in X2 (H0 = normal): p=%.3f", Stest2$p.value))[1] "Shapiro test in X2 (H0 = normal): p=0.018"
# now for the t-test (Welch's test: variances not assumed equal)
tres = t.test(x1, x2, var.equal = F)
print(sprintf("t=%.2f, p=%.3f", tres$statistic, tres$p.value))
#> [1] "t=5.40, p=0.000"
# FYI, here's the result assuming equal variance (see also Exercise 9)
# FIX(review): removed a duplicated assignment (`tres = tres = t.test(...)`)
tres = t.test(x1, x2, var.equal = T)
print(sprintf("t=%.2f, p=%.3f", tres$statistic, tres$p.value))
#> [1] "t=5.05, p=0.000"
# Figure 11.10 (Python): one-sample Wilcoxon signed-rank test against H0=1
# on right-skewed data (squared standard normals ~ chi-square(1)).
# the data
data = np.random.randn(100)**2
h0 = 1
# show the data!
_,axs = plt.subplots(1,2,figsize=(7,3.5))
## raw data with H0 (dashed) and median (solid) reference lines
axs[0].plot(data,'ko',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[0].axhline(h0,linestyle='--',color=(.4,.4,.4),linewidth=2)
axs[0].axhline(np.median(data),color='k',linewidth=3)
axs[0].set(xlabel='Data index',ylabel='Data value',title=r'$\bf{A}$) Data')
## histogram (Freedman-Diaconis binning)
axs[1].hist(data,bins='fd',facecolor=(.9,.9,.9),edgecolor='k',label='Data')
axs[1].axvline(h0,linestyle='--',color=(.4,.4,.4),linewidth=2,label=r'H$_0$')
axs[1].axvline(np.median(data),color='k',linewidth=3,label='Median')
axs[1].set(xlabel='Data value',ylabel='Count',title=r'$\bf{B}$) Distribution')
axs[1].legend()
plt.tight_layout()
plt.savefig('ttest_ranktest.png')
plt.show()
# the test! ('approx' gives the normal-approximation z statistic)
wtest = stats.wilcoxon(data-h0,method='approx')
# and print the results
print(f'Wilcoxon test: z={wtest.zstatistic:.2f}, p={wtest.pvalue:.3f}')
# Figure 11.10 (R): same Wilcoxon signed-rank demo as the Python version above.
# the data
h0 = 1
x = rnorm(100)^2
# panel A: raw data with H0 (dashed) and median (solid) reference lines
data_A = tibble(
  data_index = 1:length(x),
  data_value = x
)
p11.10_A = ggplot(
  data=data_A,
  aes(x = data_index,
      y = data_value)
) +
  geom_point(
    alpha=.6,
    size=5,
    shape=21,
    fill="gray"
  ) +
  geom_hline(
    yintercept = h0,
    linewidth=1,
    linetype="dashed",
    color="#737373"
  ) +
  geom_hline(
    yintercept = median(data_A$data_value),
    linewidth=1
  ) +
  myTheme +
  labs(x = "Data index", y="Data value",
       title = bquote(bold("A)")~"Data"))
# panel B: histogram; nclass.FD matches numpy's bins='fd' (Freedman-Diaconis)
nbreaks_B = nclass.FD(x)
data_B = hist(x, breaks=nbreaks_B, plot=F)
data_B = tibble(
  data_index = data_B$mids,
  data_value = data_B$counts
)
p11.10_B = ggplot(
  data = data_B,
  aes(x = data_index, y = data_value)
) +
  geom_col(
    aes(fill="Data"),
    color="black",
  ) +
  geom_vline(
    aes(color="H0", xintercept=h0, linetype="H0"),
    linewidth=1,
    key_glyph = "path"
  ) +
  geom_vline(
    aes(xintercept = median(data_A$data_value),
        color="Median", linetype="Median"),
    linewidth=1,
    key_glyph = "path"
  ) +
  scale_fill_manual(
    name = NULL,
    values = c("Data" = "gray")
  ) +
  scale_color_manual(
    name = NULL,
    values = c("H0"="#737373",
               "Median"='black'),
  ) +
  scale_linetype_manual(
    name = NULL,
    values = c("H0" = "dashed",
               "Median" = "solid")
  ) +
  myTheme +
  theme(
    legend.position = c(.8, .8),
    legend.spacing.x = unit(.3, 'cm'),
    legend.background = element_rect(fill=NA),
    legend.spacing = unit(-15, "pt")
  ) +
  labs(color="", fill="",
       y="Count", x="Data value", linetype="",
       title=bquote(bold("B)")~"Distribution"))
p11.10 = p11.10_A + p11.10_B
p11.10
ggsave("ttest_ranktest.png", p11.10, width=7, height=3.35)
# the test!
wtest = wilcox.test(
  x - h0,
  exact = F,   # no exact calculation for the p-value
  correct = F, # not applying the continuity correction
  alternative = "two.sided"
)
# and print the results
# NOTE: R does not provide the zstatistic. Displaying the Wilcoxon statistic (V):
print(sprintf("Wilcoxon test: %.2f, p=%.3f", wtest$statistic, wtest$p.value))
# -> [1] "Wilcoxon test: 1660.00, p=0.003"
# Figure 11.11 (Python): Wilcoxon z for data shifted below (i=0) vs above
# (i=1) zero. The shift is i*2-1, i.e. -1 then +1.
# create the data, shifted by H0=1
_,axs = plt.subplots(2,1,figsize=(3,6))
for i in range(2):
    # create and shift data
    d = np.random.randn(30)
    d += i*2-1
    # Wilcoxon z-score (normal approximation)
    z = stats.wilcoxon(d,method='approx',alternative='two-sided').zstatistic
    # draw the figure
    axs[i].plot(d,range(len(d)),'ko',markersize=8)
    axs[i].axvline(0,zorder=-1,color='gray')
    axs[i].set(xlabel='Data values',ylabel='Data index',yticks=[],xlim=[-3,3])
    axs[i].set_title(f'Wilcoxon z={z:.2f}',loc='center')
plt.tight_layout()
plt.savefig('ttest_wilcoxonSign.png')
plt.show()
# Note that R does not provide the z-scored statistic of the Wilcoxon test.
# So, the R version below displays the original Wilcoxon value instead.
# Figure 11.11 ----
# R version of the shifted-data Wilcoxon demo.
# NOTE(review): wilcox.test()$statistic is the V statistic, not a z-score,
# and it is displayed unformatted in the titles; labeled "z" to mirror the
# Python figure.
# panel A:
# create and shift data (shifted down by 1)
dataA = tibble(
  data_index=1:30,
  data_value=rnorm(30) - 1,
)
zA = wilcox.test(
  dataA$data_value,
  exact = F,
  correct = F,
  alternative = "two.sided"
)$statistic
p11.11_A = ggplot(
  data=dataA,
  aes(x=data_value, y=data_index)
) +
  geom_point(
    size = 4,
    shape=21,
    fill="black"
  ) +
  geom_vline(
    xintercept = 0,
    linewidth=1,
    color="gray"
  ) +
  myTheme +
  labs(y="Data index", x = "Data values",
       title= glue("Wilcoxon z={zA}"))
# panel B:
# create and shift data (shifted up by 1)
dataB = tibble(
  data_index=1:30,
  data_value=rnorm(30) + 1,
)
zB = wilcox.test(
  dataB$data_value,
  exact = F,
  correct = F,
  alternative = "two.sided"
)$statistic
p11.11_B = ggplot(
  data=dataB,
  aes(x=data_value, y=data_index)
) +
  geom_point(
    size = 4,
    shape=21,
    fill="black"
  ) +
  geom_vline(
    xintercept = 0,
    linewidth=1,
    color="gray"
  ) +
  myTheme +
  labs(y="Data index", x = "Data values",
       title= glue("Wilcoxon z={zB}"))
# final plot (stacked vertically via patchwork's "/")
p11.11 = p11.11_A / p11.11_B
p11.11
# saving:
ggsave("ttest_wilcoxonSign.png", p11.11, width=4, height=6)
# Mann-Whitney U test (Python): same data-generating scheme as used for the
# independent-samples t-test (exponentially-modified normal vs Gumbel).
data1 = stats.exponnorm.rvs(3,size=50)
data2 = stats.gumbel_r.rvs(size=42)
# MW-U test
mwu = stats.mannwhitneyu(data1,data2)
print(f'U = {mwu.statistic:.0f}, p = {mwu.pvalue:.3f}')
# parametric Welch t-test (gives the same statistical conclusion as the MWU)
tres = stats.ttest_ind(data1,data2,equal_var=False)
print(f't = {tres.statistic:.2f}, p = {tres.pvalue:.3f}')
# Mann-Whitney U test ----
# same data as we used for the independent-samples t-test
# Exponentially modified normal:
x1 = remg(50, lambda = 1/3) # lambda = 1/K
# Gumbel distribution
x2 = extraDistr::rgumbel(n = 42)
# MW-U test (wilcox.test on two samples is the Mann-Whitney U test)
mwu = wilcox.test(x1, x2)
print(sprintf("U=%.0f, p=%.3f", mwu$statistic, mwu$p.value))
# -> [1] "U=1620, p=0.000"
# parametric t-test (gives the same statistical conclusion as the MWU)
tres = t.test(x1, x2, var.equal = F)
print(sprintf("t=%.0f, p=%.3f", tres$statistic, tres$p.value))
# -> [1] "t=4, p=0.000"
# Exercise 1 (Python): one-sample t-test against H0 = -pi/2 on asymmetric-
# Laplace data, computed both manually and via scipy.
# parameters
N = 50
h0 = -np.pi/2
# create the dataset
X = stats.laplace_asymmetric.rvs(2,size=N)
dataMean = np.mean(X)
# visualize the data
_,axs = plt.subplots(1,2,figsize=(9,3))
axs[0].plot(X,'kp',markersize=8,markerfacecolor=(.9,.9,.9),label='Data')
axs[0].plot([0,N],[h0,h0],'k--',zorder=-10,linewidth=3,label=r'H$_0$ value')
axs[0].plot([0,N],[dataMean,dataMean],'k:',linewidth=3,label='Emp. mean')
axs[0].set(xlabel='Data index',ylabel='Data value')
axs[0].set_title(r'$\bf{A}$) Raw data')
axs[1].hist(X,bins='fd',color=(.9,.9,.9),edgecolor='k')
axs[1].axvline(h0,linestyle='--',color='k',linewidth=3,label=r'H$_0$ value')
axs[1].axvline(dataMean,linestyle=':',color='k',linewidth=3,label=r'Emp. mean')
axs[1].set(xlabel='Data value',ylabel='Count')
axs[1].set_title(r'$\bf{B}$) Histogram and means')
axs[1].legend(bbox_to_anchor=[1,.9])
plt.tight_layout()
plt.savefig('ttest_ex1.png')
plt.show()
# now for the t-tests
## manual calculation: t = (mean - h0) / (s / sqrt(N))
t_num = dataMean - h0
t_den = np.std(X,ddof=1) / np.sqrt(N)
tval = t_num / t_den
pval = 1-stats.t.cdf( np.abs(tval) ,N-1)
pval *= 2 # double it for 2-tailed test
## using scipy.stats
r = stats.ttest_1samp(X,h0)
t = r.statistic
df = r.df
p = r.pvalue
# print both results (they should agree)
print(f'Manual ttest: t({N-1})={tval:.3f}, p={pval:.3f}')
print(f'Scipy ttest:  t({df})={t:.3f}, p={p:.3f}')
# Exercise 1 ----
# Exercise 1 (R): one-sample t-test against H0 = -pi/2 on asymmetric-Laplace
# data, computed both manually and via t.test().
# parameters
N = 50
h0 = -pi/2
# create the dataset:
X = LaplacesDemon::ralaplace(
  n = N,
  scale = sqrt(2),
  kappa = 2
)
dataMean = mean(X)
dataMean
# -> [1] -1.262486
data_ = tibble(
  data_index = 1:N,
  data_value = X
)
# panel A: raw data with H0 (dashed) and empirical mean (dotted) lines
pE1_A = ggplot(data = data_, aes(x=data_index,y=data_value)) +
  geom_point(
    shape=22,
    size=3,
    fill = "gray",
    alpha=.8
  ) +
  geom_hline(
    yintercept = h0,
    linetype="dashed",
    color="black",
    linewidth=1
  ) +
  geom_hline(
    yintercept = dataMean,
    linetype="dotted",
    color="black",
    linewidth=1
  ) +
  myTheme +
  labs(x = "Data index", y = "Data Value",
       title=bquote(bold("A)")~"Raw Data"))
# panel B: histogram with the same reference lines
data_B = hist(X, breaks = nclass.FD(X), plot = F)
data_B = tibble(
  data_index = data_B$mids,
  data_value = data_B$counts
)
pE1_B = ggplot(
  data = data_B,
  aes(x = data_index, y = data_value)
) +
  geom_col(
    fill="gray",
    color="black",
  ) +
  geom_vline(
    aes(color="H0 value", xintercept=h0, linetype="H0 value"),
    linewidth=1,
    key_glyph = "path"
  ) +
  geom_vline(
    aes(xintercept = dataMean,
        color="Emp. mean", linetype="Emp. mean"),
    linewidth=1,
    key_glyph = "path"
  ) +
  scale_color_manual(
    name = NULL,
    values = c("H0 value"="#737373",
               "Emp. mean"='black'),
  ) +
  scale_linetype_manual(
    name = NULL,
    values = c("H0 value" = "dashed",
               "Emp. mean" = "dotted")
  ) +
  myTheme +
  theme(
    legend.position = "right",
    legend.spacing.x = unit(.3, 'cm'),
    legend.background = element_rect(fill=NA),
    legend.spacing = unit(-15, "pt"),
  ) +
  labs(color="", fill="",
       y="Count", x="Data value", linetype="",
       title=bquote(bold("B)")~"Histogram and means"))
# final plot
pE1 = pE1_A + pE1_B
pE1
# saving
ggsave("ttest_ex1.png", pE1, width=15, height = 5)
# now for the test
## manual calculation: t = (mean - h0) / (s / sqrt(N))
t_num = dataMean - h0
t_den = sd(X) / sqrt(N)
tval = t_num / t_den
pval = pt(abs(tval), df=N-1, lower.tail = F)
pval = 2*pval # double it for 2-tailed
# using stats
r = t.test(X, mu=h0)
t = r$statistic
df = r$parameter
p = r$p.value
# print both results:
sprintf("manual ttest: t(%d) = %.3f, p=%.3f", N-1, tval, pval)
# -> [1] "manual ttest: t(49) = 1.119, p=0.268"
# (fixed: report the df returned by t.test, as the Python version does)
sprintf("Stats ttest: t(%d) = %.3f, p=%.3f", df, t, p)
# -> [1] "Stats ttest: t(49) = 1.119, p=0.268"
# Exercise 2 (Python): repeat Exercise 1's experiment many times and record
# how often p<.05, plus the sample means/stds split by significance.
# how often do we get subthreshold results?
nExps = 500
issig = np.zeros(nExps,dtype=bool) # variable type 'bool' for convenience in plotting
means = np.zeros(nExps)
stds  = np.zeros(nExps)
# run the experiment
# (Note: For a small extra challenge, you could re-implement this without
# a for-loop using matrix input, after completing the next exercise.)
for i in range(nExps):
    # generate data and store the mean/std
    X = stats.laplace_asymmetric.rvs(2,size=N)
    means[i] = np.mean(X)
    stds[i] = np.std(X,ddof=1)
    # run the ttest and store if "significant"
    r = stats.ttest_1samp(X,h0)
    issig[i] = r.pvalue<.05
# print the results
print(f'p<.05 in {np.sum(issig)}/{nExps} times.')
# show the results (x jittered around 0 for significant, 1 for non-significant)
_,axs = plt.subplots(1,2,figsize=(7,3))
# means
axs[0].plot(np.random.randn(sum(issig))/40,means[issig],
            'ko',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[0].plot(np.random.randn(sum(~issig))/40+1,means[~issig],
            'ks',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[0].set(xlim=[-.5,1.5],xticks=[0,1],xticklabels=['p<.05','p>.05'],
           title=r'$\bf{A}$) Sample means')
# stds
axs[1].plot(np.random.randn(sum(issig))/40,stds[issig],
            'ko',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[1].plot(np.random.randn(sum(~issig))/40+1,stds[~issig],
            'ks',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.6)
axs[1].set(xlim=[-.5,1.5],xticks=[0,1],xticklabels=['p<.05','p>.05'],
           title=r'$\bf{B}$) Sample stds.')
plt.tight_layout()
plt.savefig('ttest_ex2.png')
plt.show()
# Exercise 2
# Exercise 2 (R): repeat Exercise 1's experiment many times and record how
# often p<.05, plus the sample means/stds split by significance.
# how often do we get subthreshold results?
nExps = 500
issig = vector("logical", nExps)
means = vector("numeric", nExps)
stds = vector("numeric", nExps)
# run the experiment
# (Note: For a small extra challenge, you could re-implement this
# without a for-loop using matrix input, after completing the next
# exercise.)
for( i in seq(1, nExps, by=1) ){
  # generate data and store the mean/std
  X = LaplacesDemon::ralaplace(
    n = N,
    scale = sqrt(2),
    kappa = 2
  )
  means[i] = mean(X)
  stds[i] = sd(X)
  # run the ttest and store if "significant"
  r = t.test(X, mu = h0, alternative = "two.sided")
  issig[i] = r$p.value < .05
}
print(glue("p<.05 in {sum(issig)}/{nExps} times."))
# -> p<.05 in 41/500 times.
# show the results (x jittered around 0 for significant, 1 for non-significant)
pE2_A = ggplot() +
  geom_point(
    data = tibble(data_index=rnorm(sum(issig))/40,
                  data_value=means[issig]),
    aes(x=data_index, y=data_value),
    shape=21,
    size=3,
    fill="gray",
    alpha=.6
  ) +
  geom_point(
    data = tibble(data_index=rnorm(sum(!issig))/40 + 1,
                  data_value=means[!issig]),
    aes(x=data_index, y=data_value),
    shape=22,
    size=3,
    fill="gray",
    alpha=.6
  ) +
  scale_x_continuous(
    breaks = c(0, 1),
    labels= c("p<.05", "p>.05")
  ) +
  myTheme +
  labs(y="", x = "",
       title=bquote(bold("A)")~"Sample Means"))
pE2_B = ggplot() +
  geom_point(
    data = tibble(data_index=rnorm(sum(issig))/40,
                  data_value=stds[issig]),
    aes(x=data_index, y=data_value),
    shape=21,
    size=3,
    fill="gray",
    alpha=.6
  ) +
  geom_point(
    data = tibble(data_index=rnorm(sum(!issig))/40 + 1,
                  data_value=stds[!issig]),
    aes(x=data_index, y=data_value),
    shape=22,
    size=3,
    fill="gray",
    alpha=.6
  ) +
  scale_x_continuous(
    breaks = c(0, 1),
    labels= c("p<.05", "p>.05")
  ) +
  myTheme +
  labs(y="", x = "",
       title=bquote(bold("B)")~"Sample stds.")) # fixed: was mislabeled "A)"
pE2 = pE2_A + pE2_B
pE2
# saving:
ggsave("ttest_ex2.png", pE2, width=7, height=3)
NperSample = 40
# Exercise 3 (Python): scipy's ttest_1samp accepts a matrix and runs one test
# per column; verify against an explicit per-column loop.
NperSample = 40  # (repeated here so this chunk is self-contained)
MDatasets = 25
# data: one dataset per column
X = np.random.normal(loc=1,scale=1,size=(NperSample,MDatasets))
# confirm data size
print('Data size should be sample-size X datasets:')
print(X.shape)
# ttest with matrix input (vectorized over columns)
ttest_matrix = stats.ttest_1samp(X,0)
# ttest in for-loop over each column (each dataset)
ttest_4loop = np.zeros(MDatasets)
for i in range(MDatasets):
    ttest_4loop[i] = stats.ttest_1samp(X[:,i],0).statistic
# print the results side-by-side; the columns should match exactly
print('Matrix  | Vector')
print('--------|--------')
for i in range(MDatasets):
    print(f'{ttest_matrix.statistic[i]:.4f} | {ttest_4loop[i]:.4f}')
# Exercise 3 ----
# Exercise 3 (R): run one t-test per matrix column via apply(), and verify
# against an explicit per-column loop.
NperSample = 40
MDatasets = 25
# data: one dataset per column
X = matrix(rnorm(NperSample*MDatasets, mean = 1, sd = 1),
           nrow=NperSample, ncol = MDatasets)
print("Data size should be sample-size X dataset:")
print(dim(X))
# t.test with matrix input (apply over columns):
ttest_matrix = apply(X, 2, t.test, mu=0)
# t.test in for-loop over each column (each dataset):
ttest_4loop = vector("list", MDatasets)
for (i in seq(1, MDatasets)) {
  ttest_4loop[[i]] = t.test(X[, i], mu=0)
}
# print the results side-by-side; the columns should match exactly:
print("Matrix | Vector")
print("------ | ------")
for (i in seq(1,MDatasets)){
  print(
    sprintf("%.4f | %.4f",
            ttest_matrix[[i]]$statistic,
            ttest_4loop[[i]]$statistic)
  )
}
# Console output (moved into comments so the script parses):
# -> [1] "Data size should be sample-size X dataset:"
# -> [1] 40 25
# -> [1] "Matrix | Vector"
# -> [1] "------ | ------"
# -> [1] "7.2288 | 7.2288"
# -> [1] "6.1014 | 6.1014"
# -> [1] "6.3915 | 6.3915"
# -> [1] "8.7095 | 8.7095"
# -> [1] "4.7231 | 4.7231"
# -> [1] "5.3852 | 5.3852"
# -> [1] "5.1464 | 5.1464"
# -> [1] "5.6537 | 5.6537"
# -> [1] "9.2032 | 9.2032"
# -> [1] "7.0725 | 7.0725"
# -> [1] "6.3171 | 6.3171"
# -> [1] "5.2755 | 5.2755"
# -> [1] "6.8695 | 6.8695"
# -> [1] "6.6160 | 6.6160"
# -> [1] "4.6552 | 4.6552"
# -> [1] "7.1523 | 7.1523"
# -> [1] "6.1646 | 6.1646"
# -> [1] "5.0920 | 5.0920"
# -> [1] "5.0119 | 5.0119"
# -> [1] "6.6342 | 6.6342"
# -> [1] "4.9054 | 4.9054"
# -> [1] "6.2348 | 6.2348"
# -> [1] "6.7111 | 6.7111"
# -> [1] "5.5348 | 5.5348"
# -> [1] "5.7383 | 5.7383"
# Exercise 4 (Python): how t- and p-values depend on the data's standard
# deviation when the mean is fixed at .5 (H0 = 0).
# data parameters
N = 40
k = 300
# list of standard deviations
stds = np.linspace(.1,3,k)
# initialize the t/p vectors
t = np.zeros(k)
p = np.zeros(k)
s = np.zeros(k) # this line is for exercise 5
for i in range(len(stds)):
    X = np.random.normal(0,stds[i],size=N)
    X = X-np.mean(X) + .5 # force mean=.5
    ttest = stats.ttest_1samp(X,0)
    t[i] = ttest.statistic
    p[i] = ttest.pvalue
    # get the sample std (used in exercise 5)
    s[i] = np.std(X,ddof=1)
# and now the plotting
_,axs = plt.subplots(1,3,figsize=(10,3))
# t's
axs[0].plot(stds,t,'ks',markerfacecolor='w')
axs[0].set(xlabel='Standard deviation',ylabel='t-value',title=r'$\bf{A}$) T-values')
# p's
axs[1].plot(stds,p,'ks',markerfacecolor='w')
axs[1].set(xlabel='Standard deviation',ylabel='p-value',title=r'$\bf{B}$) P-values')
# t and p
axs[2].plot(t,p,'ks',markerfacecolor='w')
axs[2].set(xlabel='T-value',ylabel='p-value',title=r'$\bf{C}$) p by t')
plt.tight_layout()
plt.savefig('ttest_ex4.png')
plt.show()
# Exercise 4 ----
# Exercise 4 (R): how t- and p-values depend on the data's standard deviation
# when the mean is fixed at .5 (H0 = 0).
# data parameters:
N = 40
k = 300
stds = seq(.1, 3, length=k)
# initiate the t/p vectors:
t = vector("numeric", k)
p = vector("numeric", k)
s = vector("numeric", k)
for (i in seq(1, k)) {
  X = rnorm(n = N, mean = 0, sd = stds[[i]])
  X = X - mean(X) + .5 # force mean = .5
  ttest = t.test(X, mu = 0)
  t[[i]] = ttest$statistic
  p[[i]] = ttest$p.value
  # get the sample std (used in exercise 5)
  s[[i]] = sd(X)
}
# and now plotting:
pE4_A = ggplot(
  data = tibble(data_index = stds,
                data_value = t),
  aes(x = data_index, y = data_value)
) +
  geom_point(
    shape = 22,
    color="black",
    size=4
  ) +
  myTheme +
  labs(y = "t-value",
       x = "Standard deviation",
       title=bquote(bold("A)")~"T-values"))
pE4_B =
  ggplot(
    data = tibble(data_index = stds,
                  data_value = p),
    aes(x = data_index,
        y = data_value)
  ) +
  geom_point(
    shape=22,
    color="black",
    size=3
  ) +
  myTheme +
  labs(y = "p-value",
       x = "Standard deviation",
       title=bquote(bold("B)")~"P-values"))
# panel C:
pE4_C = ggplot(
  data = tibble(data_index=t,
                data_value=p),
  aes(x=data_index, y=data_value)
) +
  geom_point(
    shape=22,
    color="black",
    size=3
  ) +
  myTheme +
  labs(y="p-value", x="T-value",
       title=bquote(bold("C)")~"p by t"))
# final plot
pE4 = pE4_A + pE4_B + pE4_C
pE4
# save
ggsave("ttest_ex4.png", pE4, width=10, height = 3)
# No, it doesn't really matter, because even with N=40, the sample standard deviation
# Exercise 5 (Python): (answer continued) even with N=40, the sample standard
# deviation is a fairly accurate estimate of the population standard deviation,
# certainly for this range of standard deviation values.
# You can recreate the figure by replacing variable 'stds' with 's' in the code above,
# and by demonstrating the strong correlation between sample and theoretical standard deviation.
# correlation coefficient (values close to 1 indicate a very strong relationship)
r = np.corrcoef(stds,s)
# plot
plt.plot(stds,s,'ko')
plt.xlabel('Theoretical population standard deviations')
plt.ylabel('Empirical sample standard deviations')
plt.title(f'Correlation: r={r[0,1]:.3f}',loc='center')
plt.show()
# Exercise 5 ----
# Exercise 5 (R):
# No, it doesn't really matter, because even with N=40, the sample
# standard deviation is a fairly accurate estimate of the population
# standard deviation, certainly for this range of standard deviation
# values.
# You can recreate the figure by replacing variable 'stds' with 's'
# in the code above, and by demonstrating the strong correlation
# between sample and theoretical standard deviation.
# correlation coefficient (values close to 1 indicate a very strong
# relationship)
r = cor.test(stds,s)
# plot (x = theoretical stds, y = empirical stds, matching the Python figure)
p5 = ggplot(
  data = tibble(data_index = stds,
                data_value = s),
  aes(x = data_index, y = data_value)
) +
  geom_point(
    shape = 21,
    fill = "black",
    size = 3
  ) +
  myTheme +
  theme(plot.title = element_text(hjust = .5)) +
  # fixed: axis labels were swapped relative to the plotted data
  labs(x = "Theoretical population standard deviations",
       y = "Empirical sample standard deviations",
       title=sprintf("Correlation: r=%.3f", r$estimate))
# final plot
p5
# saving:
ggsave("ttest_ex5.png", p5, width=10, height=3)
nExperiments = 250
# Exercise 6 (Python): statistical power map — proportion of significant tests
# as a function of effect size (mean offset) and sample size.
meanoffsets = np.linspace(0,.3,51)
samplesizes = np.arange(10,811,step=50)
# initialize
propSig = np.zeros((len(samplesizes),len(meanoffsets)))
# loop over sample sizes
for sidx,ssize in enumerate(samplesizes):
    # loop over effect sizes
    for eidx,effect in enumerate(meanoffsets):
        # generate the data (nExperiments datasets at once, one per column)
        X = np.random.normal(loc=effect,scale=1.5,size=(ssize,nExperiments))
        # run the t-test and store the results
        T = stats.ttest_1samp(X,0)
        propSig[sidx,eidx] = 100*np.mean( T.pvalue<.05 )
# visualize in a matrix
plt.imshow(propSig,extent=[meanoffsets[0],meanoffsets[-1],samplesizes[0],samplesizes[-1]],
           vmin=0,vmax=25,origin='lower',aspect='auto',cmap='gray')
plt.xlabel('Mean offset')
plt.ylabel('Sample size')
cbar = plt.colorbar()
cbar.set_label('Percent tests with p<.05')
plt.tight_layout()
plt.savefig('ttest_ex6.png')
plt.show()
# Exercise 6 ----
# Exercise 6 (R): statistical power map — proportion of significant tests as
# a function of effect size (mean offset) and sample size.
nExperiments = 250
meanoffsets = seq(0, .3, length = 51)
samplesizes = seq(10, 811, by = 50)
# initialize
propSig = matrix(
  data = 0,
  nrow = length(samplesizes),
  ncol = length(meanoffsets)
)
# loop over sample sizes
for (i in seq_along(samplesizes)) {
  ssize = samplesizes[[i]]
  # loop over effect sizes
  for (j in seq_along(meanoffsets)) {
    effect = meanoffsets[[j]]
    # Generate the data (nExperiments datasets at once, one per column):
    X = matrix(
      data = rnorm(n = ssize*nExperiments, mean = effect, sd = 1.5),
      nrow = ssize,
      ncol=nExperiments
    )
    # run the t-test and store the percent of significant results
    T_ = apply(X, 2, t.test, mu=0)
    propSig[i,j] = 100*mean(map_dbl(T_, "p.value") < .05)
  }
}
# Create a long-format data frame with the values
data_ = as_tibble(propSig)
colnames(data_) = meanoffsets
data_[["SampleSizes"]] = samplesizes
data_ = pivot_longer(data_, -"SampleSizes", names_to = "MeanOffSets")
# Create the ggplot
# NOTE(review): MeanOffSets is a character column here; numeric breaks in
# scale_x_discrete may not match the labels exactly — verify the axis output.
pE6 = ggplot(data_, aes(x= MeanOffSets, y=SampleSizes, fill=value)) +
  geom_tile() +
  scale_fill_gradient(
    low = "black",
    high = "white",
    limits = c(0, 25)
  ) +
  myTheme +
  scale_x_discrete(
    breaks = seq(min(meanoffsets), max(meanoffsets), by=.05)
  ) +
  labs(x = "Mean offset", y = "Sample size",
       fill = "Percent tests with p<.05")
pE6
# save
ggsave("ttest_ex6.png", pE6, width=8, height=4)
Xn = np.array([ 60, 52, 90, 20, 33, 95, 18, 47, 78, 65 ])
# Exercise 7 (Python): compare four ways of quantifying a paired change
# (Xn -> Xq), plotted pairwise against each other.
Xq = np.array([ 65, 60, 84, 23, 37, 95, 17, 53, 88, 66 ])
sampsize = len(Xn)
# simple subtraction (Y1 in the text)
Ysub = Xq-Xn
# zscore subtraction (Y2 in the text)
Ysbz = stats.zscore(Xq) - stats.zscore(Xn)
# percent change (Y3 in the text)
Ypct = 100*(Xq-Xn) / Xn
# normalized ratio (Y4 in the text)
Ynrt = (Xq-Xn) / (Xq+Xn)
# plot all pairwise combinations
_,axs = plt.subplots(2,3,figsize=(10,6))
axs[0,0].plot(Ysub,Ysbz,'ko',markersize=10,markerfacecolor=(.7,.7,.7))
axs[0,0].set(xlabel='Subtraction',ylabel='Z-scored')
axs[0,1].plot(Ysub,Ypct,'ko',markersize=10,markerfacecolor=(.7,.7,.7))
axs[0,1].set(xlabel='Subtraction',ylabel='Percent change')
axs[0,2].plot(Ysub,Ynrt,'ko',markersize=10,markerfacecolor=(.7,.7,.7))
axs[0,2].set(xlabel='Subtraction',ylabel='Norm. ratio')
axs[1,0].plot(Ysbz,Ypct,'ko',markersize=10,markerfacecolor=(.7,.7,.7))
axs[1,0].set(xlabel='Z-scored',ylabel='Percent change')
axs[1,1].plot(Ysbz,Ynrt,'ko',markersize=10,markerfacecolor=(.7,.7,.7))
axs[1,1].set(xlabel='Z-scored',ylabel='Norm. ratio')
axs[1,2].plot(Ypct,Ynrt,'ko',markersize=10,markerfacecolor=(.7,.7,.7))
axs[1,2].set(xlabel='Percent change',ylabel='Norm. ratio')
plt.tight_layout()
plt.savefig('ttest_ex7.png')
plt.show()
# Exercise 7 ----
# Exercise 7 (R): compare four ways of quantifying a paired change (Xn -> Xq).
# (fixed: the panel variables were named pE6_*, clashing with Exercise 6's
#  pE6 plot — renamed to pE7_*; also fixed the "Norm. ration" label typo.)
Xn = c(60, 52, 90, 20, 33, 95, 18, 47, 78, 65)
Xq = c(65, 60, 84, 23, 37, 95, 17, 53, 88, 66)
sampsize = length(Xn)
# simple subtraction (Y1 in the text)
Ysub = Xq-Xn
# zscore subtraction (Y2 in the text)
Ysbz = scale(Xq) - scale(Xn)
Ysbz = as.vector(Ysbz)
# percent change (Y3 in the text)
Ypct = 100*(Xq-Xn) / Xn
# normalized ratio (Y4 in the text)
Ynrt = (Xq-Xn) / (Xq+Xn)
# plots (all pairwise combinations)
pE7_1 = ggplot(
  data = tibble(x = Ysub, y = Ysbz),
  aes(x = x, y = y)
) +
  geom_point(
    size=5,
    shape=21,
    fill="gray"
  ) +
  myTheme +
  labs(x="Subtraction", y="Z-scored")
pE7_2 = ggplot(data = tibble(x = Ysub, y = Ypct),
               aes(x=x,y=y)) +
  geom_point(
    size=5,
    shape=21,
    fill="gray"
  ) +
  myTheme +
  labs(x="Subtraction", y="Percent change")
pE7_3 = ggplot(data = tibble(x = Ysub, y = Ynrt),
               aes(x=x,y=y)) +
  geom_point(
    size=5,
    shape=21,
    fill="gray"
  ) +
  myTheme +
  labs(x="Subtraction", y="Norm. ratio")
pE7_4 = ggplot(data = tibble(x = Ysbz, y = Ypct),
               aes(x=x,y=y)) +
  geom_point(
    size=5,
    shape=21,
    fill="gray"
  ) +
  myTheme +
  labs(x="Z-scored", y="Percent change")
pE7_5 = ggplot(data = tibble(x = Ysbz, y = Ynrt),
               aes(x=x,y=y)) +
  geom_point(
    size=5,
    shape=21,
    fill="gray"
  ) +
  myTheme +
  labs(x="Z-scored", y="Norm. ratio")
pE7_6 = ggplot(data = tibble(x = Ypct, y = Ynrt),
               aes(x=x,y=y)) +
  geom_point(
    size=5,
    shape=21,
    fill="gray"
  ) +
  myTheme +
  labs(x="Percent change", y="Norm. ratio")
# final plot:
pE7 = pE7_1 + pE7_2 + pE7_3 + pE7_4 + pE7_5 + pE7_6 + plot_layout(ncol=3)
pE7
# saving:
ggsave("ttest_ex7.png", pE7, width=10, height=6)
# parameters
# Exercise 8 (Python): t- and p-values of an independent-samples t-test as a
# function of sample size, for a fixed true mean difference; then the critical
# t-values for the corresponding degrees of freedom.
mu1 = 1.2 # population mean in dataset 1
mu2 = 1   # population mean in dataset 2
# sample sizes
ns = np.arange(10,201,step=10)
# setup the figure
_,axs = plt.subplots(2,1,figsize=(8,6))
# start the experiment!
for ni,N in enumerate(ns):
    # generate the data (100 datasets at a time, one per column)
    data1 = np.random.normal(loc=mu1,scale=.5,size=(N,100))
    data2 = np.random.normal(loc=mu2,scale=.5,size=(N,100))
    # run the ttest
    ttest = stats.ttest_ind(data1,data2)
    t = ttest.statistic
    p = ttest.pvalue
    # plot the t-value, colored by significance
    axs[0].plot(np.full(np.sum(p>.05),N),t[p>.05],'ks',markersize=8,markerfacecolor=(.5,.5,.5),alpha=.3)
    axs[0].plot(np.full(np.sum(p<.05),N),t[p<.05],'ro',markersize=8,markerfacecolor=(.7,.3,.3))
    # plot the p-values
    axs[1].plot(np.full(np.sum(p>.05),N),p[p>.05],'ks',markersize=8,markerfacecolor=(.5,.5,.5),alpha=.3)
    axs[1].plot(np.full(np.sum(p<.05),N),p[p<.05],'ro',markersize=8,markerfacecolor=(.7,.3,.3))
## rest of the visualization
axs[0].set(xlabel='Sample size (equal groups)',xticks=ns[::2],ylabel='T-test value')
axs[0].set_title(r'$\bf{A}$) T-values')
# adjust the p-values panel (log scale, .05 threshold line)
axs[1].set(xlabel='Sample size (equal groups)',xticks=ns[::2],ylabel='P value')
axs[1].set_yscale('log')
axs[1].axhline(.05,linestyle='--',color='r')
axs[1].set_title(r'$\bf{B}$) P-values')
plt.tight_layout()
plt.savefig('ttest_ex8.png')
plt.show()
# compute critical t-values for the degrees of freedom (2-tailed, alpha=.05)
tCrit = stats.t.isf(.05/2,2*ns-2)
# and visualize
plt.figure(figsize=(7,3))
plt.plot(ns,tCrit,'ko',markersize=10,markerfacecolor=(.6,.6,.6))
plt.ylim([1.8,2.2])
plt.xlabel('Degrees of freedom')
plt.ylabel('Critical t-values (2-tailed)')
plt.show()
# Exercise 8 ----
# Exercise 8 (R): t- and p-values of an independent-samples t-test as a
# function of sample size; then the critical t-values for the corresponding
# degrees of freedom. The panels are built up incrementally inside the loop.
# parameters
mu1 = 1.2 # population mean in dataset 1
mu2 = 1 # population mean in dataset 2
# sample sizes
ns = seq(10, 201, by=10)
pE8_A = ggplot()
pE8_B = ggplot()
# start the experiment
for (i in seq_along(ns)){
  N = ns[[i]]
  # generate the data (100 datasets at a time, one per column)
  data1 = matrix(
    rnorm(N*100, mean = mu1, sd = .5),
    nrow=N, ncol=100
  )
  data2 = matrix(
    rnorm(N*100, mean = mu2, sd = .5),
    nrow=N, ncol=100
  )
  # run the ttest column-by-column:
  ttest = map2(as_tibble(data1), as_tibble(data2), t.test, var.equal=T, paired=F)
  t = map_dbl(ttest, "statistic") %>% unname
  p = map_dbl(ttest, "p.value") %>% unname
  # plot the t-value, colored by significance
  pE8_A = pE8_A +
    geom_point(
      data = tibble(x = rep(N, length=sum(p > .05)),
                    y = t[p > .05]),
      aes(x = x, y = y),
      shape = 22,
      size=5,
      alpha=.3,
      fill="gray"
    ) +
    geom_point(
      data = tibble(x = rep(N, length=sum(p < .05)),
                    y = t[p < .05]),
      aes(x = x, y = y),
      shape = 21,
      size=5,
      alpha=.3,
      fill="red"
    ) +
    myTheme
  # plot the p-values, colored by significance
  pE8_B = pE8_B +
    geom_point(
      data = tibble(x = rep(N, length=sum(p > .05)),
                    y = p[p > .05]),
      aes(x = x, y = y),
      shape = 22,
      size=5,
      alpha=.3,
      fill="gray"
    ) +
    geom_point(
      data = tibble(x = rep(N, length=sum(p < .05)),
                    y = p[p < .05]),
      aes(x = x, y = y),
      shape = 21,
      size=5,
      alpha=.3,
      fill="red"
    ) +
    myTheme
}
# final adjustments:
pE8_A = pE8_A +
  scale_x_continuous(breaks = ns) +
  labs(title=bquote(bold("A)")~"T-values"),
       x = "Sample size (equal groups)", y = "T-test value")
pE8_B = pE8_B +
  scale_x_continuous(breaks = ns) +
  scale_y_log10() +
  geom_hline(
    yintercept = 0.05,
    linetype="dashed",
    color="red",
    linewidth=1
  ) +
  labs(title=bquote(bold("B)")~"P-values"),
       x = "Sample size (equal groups)", y = "P value")
# final plot:
pE8 = pE8_A + pE8_B + plot_layout(ncol=1)
pE8
# saving:
ggsave("ttest_ex8.png", pE8, width = 10, height = 6)
# compute critical t-values for the degrees of freedom (2-tailed, alpha=.05)
tCrit = map_dbl(2*ns-2, ~qt(.05/2, ., lower.tail = F))
# and visualize
ggplot(data = tibble(x = ns, y = tCrit),
       aes(x = x, y = y))+
  geom_point(
    shape=21,
    size=5,
    fill="gray"
  ) +
  ylim(c(1.8, 2.2)) +
  myTheme +
  labs(x = 'Degrees of freedom',
       y = 'Critical t-values (2-tailed)')
# range of standard deviations
# Exercise 9 (Python): how Levene's test and the equal- vs unequal-variance
# t-tests behave as the second group's standard deviation grows.
stdevs = np.linspace(.01,15,41)
# initialize results matrix
results = np.zeros((3,len(stdevs)))
tCrit = np.zeros(len(stdevs))
# start the experiment!
for si,std in enumerate(stdevs):
    # create two groups of data
    X1 = np.random.normal(loc=1,scale=1,size=50)
    X2 = np.random.normal(loc=1.1,scale=std,size=40)
    # levene's test (log p-value for readability)
    results[0,si] = np.log( stats.levene(X1,X2).pvalue )
    # difference of t-values
    same_var = stats.ttest_ind(X1,X2,equal_var=True)  # equal variance
    diff_var = stats.ttest_ind(X1,X2,equal_var=False) # unequal variance (Welch)
    results[1,si] = same_var.statistic # equal variance
    results[2,si] = diff_var.statistic # unequal variance
    # compute Welch-Satterthwaite df for tCrit
    s1,s2 = np.var(X1,ddof=1),np.var(X2,ddof=1)
    n1,n2 = len(X1),len(X2)
    df_num = (s1/n1 + s2/n2)**2
    df_den = s1**2/(n1**2*(n1-1)) + s2**2/(n2**2*(n2-1))
    tCrit[si] = stats.t.isf(.05/2,df_num/df_den)
# plot
_,axs = plt.subplots(1,2,figsize=(10,4))
# levene's test results
axs[0].plot(stdevs,results[0,:],'ks',markersize=10,markerfacecolor='gray')
axs[0].axhline(np.log(.05),color=(.6,.6,.6),linestyle='--',zorder=-1)
axs[0].text(np.max(stdevs),np.log(.1),'p=.05',ha='right',color=(.6,.6,.6))
axs[0].set(xlabel=r'Standard deviation of $X_2$',ylabel='log(p)',title=r"$\bf{A}$) Levene's P-values")
# t-tests
axs[1].plot(stdevs,results[1,:],'ks',markersize=8,markerfacecolor=(.4,.4,.4),label='Equal var.')
axs[1].plot(stdevs,results[2,:],'ko',markersize=8,markerfacecolor=(.8,.8,.8),label='Unequal var.')
axs[1].plot(stdevs,tCrit,'--',color=(.6,.6,.6),zorder=-1,label='p=.05')
axs[1].plot(stdevs,-tCrit,'--',color=(.6,.6,.6),zorder=-1)
axs[1].set(xlabel=r'Standard deviation of $X_2$',ylabel='T-value',title=r"$\bf{B}$) T-test t-values")
axs[1].legend(fontsize=10,bbox_to_anchor=[.8,1])
plt.tight_layout()
plt.savefig('ttest_ex9.png')
plt.show()
# Not in the instructions, but I think it's also interesting to
# plot the ratio of t-values as a function of standard deviation
# values >1 indicate a larger t-value for equal compared to unequal variance formula
plt.plot(stdevs,results[1,:]/results[2,:],'ko',markersize=8)
plt.axhline(1,linestyle='--',color='gray',zorder=-1)
plt.ylim([.5,1.5])
plt.ylabel('Equal to Unequal variance t ratio')
plt.show()
# Exercise 9 ----
# Exercise 9 (R): how Levene's test and the equal- vs unequal-variance
# t-tests behave as the second group's standard deviation grows.
# range of standard deviations
stdevs = seq(.01, 15, length=41)
# initialize results matrix
results = matrix(0, nrow = 3, ncol=length(stdevs))
tCrit = rep(0, length(stdevs))
# start the experiment!
for (i in seq_along(stdevs) ) {
  std = stdevs[[i]]
  # create two groups of data
  X1 = rnorm(mean=1, sd=1, n=50)
  X2 = rnorm(mean=1.1, sd=std, n=40)
  # levene's test (log p-value for readability)
  levene_test = car::leveneTest(
    x ~ group,
    bind_rows(tibble(x= X1, group = "sample 1"),
              tibble(x= X2, group = "sample 2"))
  )
  results[1, i] = log(levene_test$`Pr(>F)`[[1]])
  # difference of t-values
  same_var = t.test(X1, X2, var.equal = T) # equal variance
  diff_var = t.test(X1, X2, var.equal = F) # unequal variance (Welch)
  results[2, i] = same_var$statistic # equal variance
  results[3, i] = diff_var$statistic # unequal variance
  # compute Welch-Satterthwaite df for tCrit
  s1 = var(X1)
  s2 = var(X2)
  n1 = length(X1)
  n2 = length(X2)
  df_num = (s1/n1 + s2/n2)^2
  df_den = s1^2/(n1^2*(n1-1)) + s2^2/(n2^2*(n2-1))
  tCrit[[i]] = qt(.05/2, df =df_num/df_den, lower.tail = F)
}
# plot
# levene's test results
pE9_A = ggplot(data = tibble(x = stdevs,
                             y = results[1,]),
               aes(x=x, y=y)) +
  geom_point(
    shape=22,
    fill="#525252",
    size=5
  ) +
  geom_hline(
    yintercept = log(.05),
    linetype="dashed",
    color="gray",
    linewidth=1
  ) +
  annotate(
    "text",
    x = max(stdevs),
    y = log(.1),
    label = "p = .05",
    color="gray",
    size=4
  ) +
  myTheme +
  labs(title=bquote(bold("A)")~"Levene's P-values"),
       x = bquote("Standard deviation of"~X[2]),
       y="log(p)")
# t-tests
pE9_B = ggplot( ) +
  geom_point(
    data = tibble(x = stdevs,
                  y = results[2, ]),
    aes(x=x, y = y,
        shape = "Equal var.",
        fill = "Equal var."),
    size=5
  ) +
  geom_point(
    data = tibble(x = stdevs,
                  y = results[3, ]),
    aes(x=x, y = y,
        shape = "Unequal var.",
        fill = "Unequal var."),
    size=5
  ) +
  geom_line(
    data = tibble(x = stdevs,
                  y = tCrit),
    aes(x = x, y = y,
        color = "p == .05"),
    linetype="dashed",
    linewidth=1
  ) +
  geom_line(
    data = tibble(x = stdevs,
                  y = -tCrit),
    aes(x = x, y = y),
    linetype="dashed",
    color="gray",
    linewidth=1
  ) +
  scale_fill_manual(
    values = c("Equal var." = "#525252",
               "Unequal var." = "#bdbdbd")
  ) +
  scale_shape_manual(
    values = c("Equal var." = 22,
               "Unequal var." = 21)
  ) +
  scale_color_manual(
    values=c("p == .05" = "gray")
  ) +
  myTheme +
  theme(
    legend.position = "right",
    legend.spacing.x = unit(.3, 'cm'),
    legend.background = element_rect(fill=NA),
    legend.spacing = unit(-15, "pt")
  ) +
  labs(fill="", shape="",
       title=bquote(bold("B)")~"T-test values"),
       x = bquote("Standard deviation of"~X[2]),
       y="T-value")
# final plot
pE9 = pE9_A + pE9_B
pE9
# saving:
ggsave("ttest_ex9.png", pE9, width=12, height=5)
# Not in the instructions, but I think it's also interesting to
# plot the ratio of t-values as a function of standard deviation
# values >1 indicate a larger t-value for equal compared to unequal variance formula
ggplot(data = tibble(x=stdevs,
                     y = results[2,]/results[3,]),
       aes(x = x, y = y))+
  geom_point(
    shape=21,
    fill="black",
    size=5
  ) +
  geom_hline(
    yintercept = 1,
    linetype="dashed",
    color="gray",
    linewidth=1
  ) +
  ylim(c(.5, 1.5)) +
  myTheme +
  labs(y="Equal to Unequal variance t ratio",
       x="")
# generate the data
# Exercise 10 (Python): compare the one-sample t-test against the Wilcoxon
# signed-rank test on increasingly skewed (lognormal) data.
# FIX: the loop and `if` bodies had lost their indentation (file mangling);
# valid Python block structure is restored here.
sigmas = np.linspace(.1,1.2,20)  # sigma parameter of exp(sigma*Z)
# null hypothesis value
h0 = .5
# initialize the results matrices
# tvals row 0: parametric t; row 1: Wilcoxon z
tvals = np.zeros((2,len(sigmas)))
# cents row 0: mean - h0; row 1: median - h0
cents = np.zeros((2,len(sigmas)))
_,axs = plt.subplots(1,3,figsize=(11,3))
# compute and store all moments in a matrix
for i,s in enumerate(sigmas):
    # generate mean-centered data (lognormal, then shifted to mean zero)
    X = np.exp(np.random.randn(100)*s)
    X -= np.mean(X)
    # compute and store the descriptives
    cents[0,i] = np.mean(X) - h0
    cents[1,i] = np.median(X) - h0
    # draw the histogram outline for every third sigma value
    if i%3==0:
        mc = len(sigmas)+2  # grayscale normalizer (>len avoids pure white)
        y,x = np.histogram(X,bins='fd')
        axs[0].plot((x[:-1]+x[1:])/2,y,color=(i/mc,i/mc,i/mc),linewidth=2)
        axs[0].axvline(np.median(X),color=(i/mc,i/mc,i/mc),linestyle='--',linewidth=.8)
    # parametric t-test
    tvals[0,i] = stats.ttest_1samp(X,h0).statistic
    # Wilcoxon signed-rank test (normal approximation exposes a z statistic)
    tvals[1,i] = stats.wilcoxon(X-h0,method='approx').zstatistic

## plot
axs[0].set(xlim=[-1.5,4],xlabel='Data value',ylabel='Count')
axs[0].set_title(r'$\bf{A}$) Distributions')
axs[1].plot(sigmas,cents[0,:],'ks',markersize=8,markerfacecolor=(.3,.3,.3),label=r'$\Delta$ to mean')
axs[1].plot(sigmas,cents[1,:],'ko',markersize=8,markerfacecolor=(.8,.8,.8),label=r'$\Delta$ to median')
axs[1].legend(fontsize=12)
axs[1].set(xlabel=r'$\sigma$ parameter for exp(X$\sigma$)',ylabel='Mean or median dist.')
axs[1].set_title(r'$\bf{B}$) Distance to H$_0$')
axs[2].plot(sigmas,tvals[0,:],'ks',markersize=8,markerfacecolor=(.3,.3,.3),label='Parametric t')
axs[2].plot(sigmas,tvals[1,:],'ko',markersize=8,markerfacecolor=(.8,.8,.8),label='Wilcoxon z')
axs[2].legend(fontsize=12)
axs[2].set(xlabel=r'$\sigma$ parameter for exp(X$\sigma$)',ylabel='z or t value')
axs[2].set_title(r'$\bf{C}$) Test stat. values')
plt.tight_layout()
plt.savefig('ttest_ex10a.png')
plt.show()
# plot showing relationship between central tendency distances and test statistic values
_,axs = plt.subplots(1,2,figsize=(8,3.5))
# panel A: mean-to-H0 distance (cents row 0) vs. parametric t-value (tvals row 0)
axs[0].plot(cents[0,:],tvals[0,:],'ko',markersize=12,markerfacecolor=(.6,.6,.6))
axs[0].set(xlabel='Distance: mean to .5',ylabel='T-value',xlim=[-.6,-.4],
title=r'$\bf{A}$) One-sample t-test')
# panel B: median-to-H0 distance (cents row 1) vs. Wilcoxon z-value (tvals row 1)
axs[1].plot(cents[1,:],tvals[1,:],'ko',markersize=12,markerfacecolor=(.6,.6,.6))
axs[1].set(xlabel='Distance: median to .5',ylabel='Z-value',
title=r'$\bf{B}$) Wilcoxon test')
plt.tight_layout()
plt.savefig('ttest_ex10b.png')
plt.show()# Exercise 10 ----
# generate the data
sigmas = seq(.1, 1.2, length=20)
# null hypothesis value
h0 = .5
# initialize the results matrices
# tvals row 1: parametric t; row 2: Wilcoxon statistic (see NOTE below)
tvals = matrix(0, nrow = 2, ncol = length(sigmas))
# cents row 1: mean - h0; row 2: median - h0
cents = matrix(0, nrow=2, ncol = length(sigmas))
# histogram layers are accumulated onto this empty plot inside the loop
pE10_A = ggplot()
# compute and store all moments in a matrix
for (i in seq_along(sigmas) ) {
# i = 1
s = sigmas[[i]]
# generate mean-centered data (lognormal, then shifted to mean zero)
X = exp(rnorm(100)*s)
X = X - mean(X)
# compute and store the descriptives
cents[1, i] = mean(X) - h0
cents[2, i] = median(X) - h0
# draw the histogram for every third sigma value
if (i %% 3 == 0) {
mc = length(sigmas)+2
n_breaks_ = nclass.FD(X)
# grayscale color: darker for small i, lighter for large i (deterministic, not random)
color_ = rgb(i/mc, i/mc, i/mc)
X_hist = hist(X, breaks = n_breaks_, plot = F)
pE10_A = pE10_A +
geom_line(
data = tibble(x = X_hist$mids,
y = X_hist$counts),
aes(x = x, y = y),
linewidth=1,
color = color_
) +
geom_vline(
xintercept = median(X),
linetype="dashed",
linewidth=.8,
color = color_
)
}
# parametric t-test
tvals[1,i] = t.test(X,mu = h0)$statistic
# Wilcoxon test
# NOTE(review): wilcox.test()$statistic is the signed-rank V statistic, not a
# z value; the Python version of this exercise stores wilcoxon(...).zstatistic
# instead — confirm which scale is intended before comparing across languages.
tvals[2,i] = wilcox.test(
X - h0,
exact = F, # no exact calculation for the p-value
correct = F, # not applying the continuity correction
alternative = "two.sided"
)$statistic
}
# finish panel A: restrict the x-range and apply the shared theme
# NOTE(review): xlim() removes data outside [-1.5, 4] rather than zooming;
# coord_cartesian(xlim = ...) would keep all points — confirm intent.
pE10_A = pE10_A +
xlim(c(-1.5, 4)) +
myTheme +
labs(x="Data value", y="Count",
title=bquote(bold("A)")~"Distributions"))
# panel B: distance of the sample mean and median to the H0 value (.5),
# as a function of the lognormal sigma parameter (rows 1 and 2 of `cents`).
pE10_B = ggplot() +
  # mean-to-H0 distances (dark squares, pch 22)
  geom_point(
    data = tibble(
      x = sigmas,
      y = cents[1,]
    ),
    aes(x = x, y = y,
        fill = "delta to mean",
        shape = "delta to mean"),
    size = 5
  ) +
  # median-to-H0 distances (light circles, pch 21)
  geom_point(
    data = tibble(
      x = sigmas,
      y = cents[2,]
    ),
    aes(x = x, y = y,
        fill = "delta to median",
        shape = "delta to median"),
    size = 5
  ) +
  # manual scales: grayscale fills + fillable shapes, with plotmath labels
  # so the legend shows a real Delta symbol
  scale_fill_manual(
    labels = c(bquote(Delta~"to mean"),
               bquote(Delta~"to median")),
    values = c("delta to mean" = rgb(.3, .3, .3),
               "delta to median" = rgb(.8, .8, .8))
  ) +
  scale_shape_manual(
    labels = c(bquote(Delta~"to mean"),
               bquote(Delta~"to median")),
    values = c("delta to mean" = 22,
               "delta to median" = 21)
  ) +
  myTheme +
  theme(legend.position = c(.3, .5)) +
  # FIX: y-axis label previously read "Mean of median dist."; the panel shows
  # the mean OR the median distance (cf. the Python version of this figure).
  labs(x = bquote(sigma~"parameter for"~exp(X~sigma)),
       y = "Mean or median dist.",
       title = bquote(bold("B)")~"Distance to"~H[0]),
       fill = "",
       shape = "")
# panel C:
# test-statistic values by sigma parameter: parametric t (tvals row 1)
# and Wilcoxon statistic (tvals row 2; V statistic from wilcox.test)
pE10_C = ggplot() +
# parametric t-values (dark squares, pch 22)
geom_point(
data = tibble(
x = sigmas,
y = tvals[1,]
),
aes(x = x, y = y,
fill = "Parametric t",
shape = "Parametric t"),
size = 5
) +
# Wilcoxon statistic values (light circles, pch 21)
geom_point(
data = tibble(
x = sigmas,
y = tvals[2,]
),
aes(x = x, y = y,
fill = "Wilcoxon z",
shape = "Wilcoxon z"),
size=5
) +
# grayscale fills and fillable shapes keyed to the legend labels above
scale_fill_manual(
values = c("Parametric t" = rgb(.3, .3, .3),
"Wilcoxon z" = rgb(.8, .8, .8))
) +
scale_shape_manual(
values = c("Parametric t" = 22,
"Wilcoxon z" = 21)
) +
myTheme +
theme(legend.position = c(.3, .5)) +
labs(x = bquote(sigma~"parameter for"~exp(X~sigma)),
y = "statistic",
title = bquote(bold("C)")~"Test stat. values"),
fill = "",
shape = "")
## final plot
# combine the three panels side by side (patchwork `+`), display, then export
pE10 = pE10_A + pE10_B + pE10_C
pE10# saving:
ggsave('ttest_ex10a.png', pE10, width=12, height=5)# plot showing relationship between central tendency distances and test statistic values
# panel A: one-sample t-value (tvals row 1) against the mean-to-H0
# distance (cents row 1).
pE102_A = ggplot() +
  geom_point(
    data = tibble(
      x = cents[1,],
      y = tvals[1,]
    ),
    aes(x = x, y = y),
    # FIX: `fill` is ignored by the default point shape; shape 21 is the
    # fillable circle, matching sibling plot pE11_A and the gray-faced
    # markers of the Python version of this figure.
    shape = 21,
    fill = rgb(.6, .6, .6),
    size = 5
  ) +
  xlim(c(-.6, -.4)) +
  myTheme +
  # FIX: axis label typo "Disance" -> "Distance".
  labs(x = "Distance: mean to .5",
       y = "T-value",
       title = bquote(bold("A)")~"One-sample t-test"))
# panel B: Wilcoxon statistic (tvals row 2) against the median-to-H0
# distance (cents row 2).
pE102_B = ggplot() +
  geom_point(
    data = tibble(
      x = cents[2,],
      y = tvals[2,]
    ),
    aes(x = x, y = y),
    # FIX: shape 21 makes the gray fill visible (the default point shape
    # ignores the fill aesthetic).
    shape = 21,
    fill = rgb(.6, .6, .6),
    size = 5
  ) +
  myTheme +
  # FIX: label previously read "Disance: mean to .5" — typo, and this panel
  # plots the MEDIAN distance (cents[2,]); cf. the Python version's label
  # 'Distance: median to .5'.
  labs(x = "Distance: median to .5",
       y = "Stat-value",
       title = bquote(bold("B)")~"Wilcoxon test"))
# final plot
# combine panels A and B (patchwork `+`), display, then export
pE102 = pE102_A + pE102_B
pE102# saving
ggsave('ttest_ex10b.png', pE102, width=8, height = 4)# params
# Exercise 11 (Python): relationship between p-values and effect sizes
# (Cohen's d and R^2) across sample sizes and true mean offsets.
# FIX: the nested loop bodies had lost their indentation (file mangling);
# valid Python block structure is restored here.
meanoffsets = np.linspace(0,2,71)        # true means (effect magnitudes)
samplesizes = np.arange(10,811,step=50)  # sample sizes 10, 60, ..., 810
# initialize results: rows index sample size, columns index mean offset
pvals = np.zeros((len(samplesizes),len(meanoffsets)))
cohend = np.zeros((len(samplesizes),len(meanoffsets)))
r2 = np.zeros((len(samplesizes),len(meanoffsets)))
# loop over sample sizes
for sidx,ssize in enumerate(samplesizes):
    # loop over effect sizes
    for eidx,effect in enumerate(meanoffsets):
        # generate the data
        X = np.random.normal(loc=effect,scale=1.5,size=(ssize))
        # run the t-test and store the results
        T = stats.ttest_1samp(X,0)
        pvals[sidx,eidx] = T.pvalue
        # Cohen's d: |sample mean| in units of the sample SD (ddof=1)
        cohend[sidx,eidx] = np.abs( np.mean(X)/np.std(X,ddof=1) )
        # R^2 from the t statistic: t^2 / (t^2 + df)
        r2[sidx,eidx] = T.statistic**2 / (T.statistic**2 + T.df)

_,axs = plt.subplots(1,2,figsize=(10,4))
# eps keeps log() finite when a p-value underflows to 0
axs[0].plot(np.log(pvals+np.finfo(float).eps),cohend,'ko',markersize=8,markerfacecolor=(.8,.8,.8))
axs[0].set(xlabel='log(p)',ylabel="Cohen's d")
axs[0].set_title(r"$\bf{A}$) Cohen's d by p-values")
axs[1].plot(cohend,r2,'ko',markersize=8,markerfacecolor=(.8,.8,.8))
axs[1].set(xlabel="Cohen's d",ylabel=r'$R^2$')
axs[1].set_title(r'$\bf{B}$) Different effect size measures')
plt.tight_layout()
plt.savefig('ttest_ex11.png')
plt.show()
# Exercise 11 ----
# simulation parameters
meanoffsets = seq(0, 2, length = 71)  # true mean offsets (effect magnitudes)
samplesizes = seq(10, 811, by = 50)   # sample sizes 10, 60, ..., 810
# result matrices: one row per sample size, one column per effect size
pvals  = matrix(0, nrow = length(samplesizes), ncol = length(meanoffsets))
cohend = matrix(0, nrow = length(samplesizes), ncol = length(meanoffsets))
r2     = matrix(0, nrow = length(samplesizes), ncol = length(meanoffsets))
# run one simulated t-test per (sample size, effect size) combination
for (row_i in seq_along(samplesizes)) {
  n_obs = samplesizes[row_i]
  for (col_j in seq_along(meanoffsets)) {
    mu_true = meanoffsets[col_j]
    # simulate the sample and test its mean against zero
    X = rnorm(mean = mu_true, sd = 1.5, n = n_obs)
    fit = t.test(X, mu = 0)
    pvals[row_i, col_j] = fit$p.value
    # Cohen's d: |sample mean| in units of the sample SD
    cohend[row_i, col_j] = abs(mean(X) / sd(X))
    # R^2 derived from the t statistic: t^2 / (t^2 + df)
    r2[row_i, col_j] = fit$statistic^2 / (fit$statistic^2 + fit$parameter)
  }
}
# plotting
# panel A: Cohen's d against log-transformed p-values; a tiny constant is
# added so that p = 0 does not produce -Inf under log().
pE11_A =
  ggplot(
    data = tibble(data_index = log(as.vector(pvals) + 1e-16),
                  data_value = as.vector(cohend)),
    aes(x = data_index, y = data_value)
  ) +
  geom_point(
    size = 3,
    fill = rgb(.8, .8, .8),
    shape = 21
  ) +
  myTheme +
  # FIX: title typo "Cohen'd by p-values" -> "Cohen's d by p-values"
  # (matches the Python version of this figure).
  labs(title = bquote(bold("A)")~"Cohen's d by p-values"),
       x = "log(p)", y = "Cohen's d")
# panel B: R^2 as a function of Cohen's d (each point is one simulation run)
pE11_B =
ggplot(
data = tibble(data_index = as.vector(cohend),
data_value = as.vector(r2)),
aes(x = data_index, y = data_value)
) +
geom_point(
size = 3,
fill = rgb(.8, .8, .8),
shape = 21
) +
myTheme +
labs(title = bquote(bold("B)")~"Different effect size measures"),
x = "Cohen's d", y = bquote(R^2))
# final plot
# combine panels A and B (patchwork `+`), display, then export
pE11 = pE11_A + pE11_B
pE11# saving
ggsave("ttest_ex11.png", pE11, width=10, height=4)